diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 0047755fcbc3ec26d86b3c24075354534c00ef13..789cef1c8d1cab5708176863b8402655289c1305 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,9 +1,8 @@
-"Python 2.7 AMD CPU (+GL and special func)":
+"Python 2.7 AMD CPU":
   script:
   - export PY_EXE=python2.7
   - export PYOPENCL_TEST=amd:pu
-  - export EXTRA_INSTALL="numpy mako scipy pyfmmlib"
-  - echo "CL_ENABLE_GL = True" > siteconf.py
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   allow_failure: true
@@ -18,7 +17,7 @@ Python 3.6 Intel CPU:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST="intel(r):pu"
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   allow_failure: true
@@ -32,7 +31,7 @@ Python 3.6 AMD CPU:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=amd:pu
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   allow_failure: true
@@ -46,7 +45,7 @@ Python 3.6 Titan X:
   script:
   - export PY_EXE=python3.5
   - export PYOPENCL_TEST=nvi:titan
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -59,7 +58,7 @@ Python 3.6 K40:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=nvi:k40
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -72,7 +71,11 @@ Python 3.6 AMD GPU:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=amd:fiji
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
+
+  # https://andreask.cs.illinois.edu/MachineShop/UserNotes
+  - export OCL_ICD_VENDORS=/etc/OpenCLwithAMD/vendors
+
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   allow_failure: true
@@ -82,25 +85,11 @@ Python 3.6 AMD GPU:
   except:
   - tags
 
-Python 2.6 POCL CPU:
-  script:
-  - export PY_EXE=python2.6
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
-  - export NO_DOCTESTS=1
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python2.6
-  - amd-cl-cpu
-  except:
-  - tags
-
 Python 2.7 POCL:
   script:
   - export PY_EXE=python2.7
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -113,7 +102,7 @@ Python 3.7 POCL:
   script:
   - export PY_EXE=python3.7
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -126,7 +115,7 @@ Python 3.6 POCL CL 1.1:
   script:
   - export PY_EXE=python3.5
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - echo "CL_PRETEND_VERSION = '1.1'" > siteconf.py
   - ". ./build-and-test-py-project.sh"
@@ -140,7 +129,21 @@ Python 3.6 POCL:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python3.6
+  - pocl
+  except:
+  - tags
+
+Python 3.6 POCL (+GL and special functions):
+  script:
+  - export PY_EXE=python3.6
+  - export PYOPENCL_TEST=portable
+  - export EXTRA_INSTALL="pybind11 numpy mako scipy pyfmmlib"
+  - echo "CL_ENABLE_GL = True" > siteconf.py
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -153,7 +156,7 @@ Python 2.7 Apple:
   script:
   - export PY_EXE=python2.7
   - export PYOPENCL_TEST=app:cpu
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="pybind11 numpy mako"
   - export PKG_CONFIG_PATH=/usr/local/opt/libffi/lib/pkgconfig
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
@@ -178,10 +181,14 @@ PyPy POCL:
   script:
   - export PY_EXE=pypy
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+
+  # https://github.com/pybind/pybind11/pull/1494
+  - export EXTRA_INSTALL="git+https://github.com/inducer/pybind11 numpy mako"
+
   - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
+
   tags:
   - pypy
   - pocl
@@ -190,7 +197,7 @@ PyPy POCL:
 
 Documentation:
   script:
-  - EXTRA_INSTALL="numpy mako"
+  - EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh
   - ". ./build-docs.sh"
   tags:
diff --git a/.gitmodules b/.gitmodules
index cb5a4e231848994b7d000813df14d87155613ece..779ec4875e9d74d3b4eef45f6349e551c3258339 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
 [submodule "pyopencl/compyte"]
 	path = pyopencl/compyte
 	url = https://github.com/inducer/compyte
-[submodule "src/c_wrapper/mingw-std-threads"]
-	path = src/c_wrapper/mingw-std-threads
-	url = https://github.com/meganz/mingw-std-threads.git
diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
index a149bbe4cb8008125d3fe2a7dc029445b66a6ab4..b7824b0bb6b3cdbc4070215affce4e3ae7f1751d 100644
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
@@ -9,3 +9,4 @@ dependencies:
 - pocl
 - osx-pocl-opencl
 - mako
+- pybind11
diff --git a/cffi_build.py.in b/cffi_build.py.in
deleted file mode 100644
index f948c8247a124ea40b3770c13da5d4fb14c6539c..0000000000000000000000000000000000000000
--- a/cffi_build.py.in
+++ /dev/null
@@ -1,86 +0,0 @@
-from __future__ import absolute_import, print_function
-
-__copyright__ = """
-Copyright (C) 2009-15 Andreas Kloeckner
-Copyright (C) 2013 Marko Bencun
-Copyright (C) 2014 Yuyi Chao
-"""
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-from cffi import FFI
-
-ffi = FFI()
-
-
-with open("cl_types.h", "rt") as f:
-    ffi.cdef(f.read())
-
-if {CL_ENABLE_GL}:
-    with open("cl_gl_types.h") as f:
-        ffi.cdef(f.read())
-
-with open("src/c_wrapper/wrap_cl_core.h", "rt") as f:
-    ffi.cdef(f.read())
-
-if {CL_ENABLE_GL}:
-    with open("src/c_wrapper/wrap_cl_gl_core.h") as f:
-        ffi.cdef(f.read())
-
-ffi.set_source("pyopencl._cffi",
-        """
-        #include "wrap_cl.h"
-        """,
-        define_macros=list({EXTRA_DEFINES}.items()),
-        include_dirs=(
-            {CL_INC_DIR} + ["src/c_wrapper/"]),
-        library_dirs={CL_LIB_DIR},
-        libraries={CL_LIBNAME},
-        extra_compile_args=({CXXFLAGS}),
-        extra_link_args={LDFLAGS},
-        source_extension=".cpp",
-        sources=[
-            "src/c_wrapper/wrap_cl.cpp",
-            "src/c_wrapper/wrap_constants.cpp",
-            "src/c_wrapper/bitlog.cpp",
-            "src/c_wrapper/pyhelper.cpp",
-            "src/c_wrapper/platform.cpp",
-            "src/c_wrapper/device.cpp",
-            "src/c_wrapper/context.cpp",
-            "src/c_wrapper/command_queue.cpp",
-            "src/c_wrapper/event.cpp",
-            "src/c_wrapper/memory_object.cpp",
-            "src/c_wrapper/svm.cpp",
-            "src/c_wrapper/image.cpp",
-            "src/c_wrapper/gl_obj.cpp",
-            "src/c_wrapper/memory_map.cpp",
-            "src/c_wrapper/buffer.cpp",
-            "src/c_wrapper/sampler.cpp",
-            "src/c_wrapper/program.cpp",
-            "src/c_wrapper/kernel.cpp",
-            "src/c_wrapper/debug.cpp",
-            ]
-        )
-
-
-if __name__ == "__main__":
-    ffi.compile()
diff --git a/cl_gl_types.h b/cl_gl_types.h
deleted file mode 100644
index ea0e7e4df0dffdbb20dffeae094e70a0f6d23243..0000000000000000000000000000000000000000
--- a/cl_gl_types.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* cl_gl.h */
-typedef cl_uint     cl_gl_object_type;
-typedef cl_uint     cl_gl_texture_info;
-typedef cl_uint     cl_gl_platform_info;
-typedef struct __GLsync *cl_GLsync;
-typedef cl_uint     cl_gl_context_info;
-
-/* cl_egl.h */
-typedef void* CLeglImageKHR;
-typedef void* CLeglDisplayKHR;
-typedef void* CLeglSyncKHR;
-typedef intptr_t cl_egl_image_properties_khr;
diff --git a/cl_types.h b/cl_types.h
deleted file mode 100644
index 5df1601343b0d2ea5540fab54b1a4c8fabdeab6e..0000000000000000000000000000000000000000
--- a/cl_types.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* gl.h */
-typedef unsigned int    GLenum;
-typedef int             GLint;          /* 4-byte signed */
-typedef unsigned int    GLuint;         /* 4-byte unsigned */
-
-
-/* cl.h */
-/* scalar types */
-typedef int8_t          cl_char;
-typedef uint8_t         cl_uchar;
-typedef int16_t         cl_short;
-typedef uint16_t        cl_ushort;
-typedef int32_t         cl_int;
-typedef uint32_t        cl_uint;
-typedef int64_t         cl_long;
-typedef uint64_t        cl_ulong;
-
-typedef uint16_t        cl_half;
-typedef float                   cl_float;
-typedef double                  cl_double;
-
-
-typedef struct _cl_platform_id *    cl_platform_id;
-typedef struct _cl_device_id *      cl_device_id;
-typedef struct _cl_context *        cl_context;
-typedef struct _cl_command_queue *  cl_command_queue;
-typedef struct _cl_mem *            cl_mem;
-typedef struct _cl_program *        cl_program;
-typedef struct _cl_kernel *         cl_kernel;
-typedef struct _cl_event *          cl_event;
-typedef struct _cl_sampler *        cl_sampler;
-
-/* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be
-the same size as the bool in kernels. */
-typedef cl_uint             cl_bool;
-typedef cl_ulong            cl_bitfield;
-typedef cl_bitfield         cl_device_type;
-typedef cl_uint             cl_platform_info;
-typedef cl_uint             cl_device_info;
-typedef cl_bitfield         cl_device_fp_config;
-typedef cl_uint             cl_device_mem_cache_type;
-typedef cl_uint             cl_device_local_mem_type;
-typedef cl_bitfield         cl_device_exec_capabilities;
-typedef cl_bitfield         cl_device_svm_capabilities; // 2.0
-typedef cl_bitfield         cl_command_queue_properties;
-typedef intptr_t            cl_device_partition_property;
-typedef cl_bitfield         cl_device_affinity_domain;
-
-typedef intptr_t            cl_context_properties;
-typedef cl_uint             cl_context_info;
-typedef cl_uint             cl_command_queue_info;
-typedef cl_uint             cl_channel_order;
-typedef cl_uint             cl_channel_type;
-typedef cl_bitfield         cl_mem_flags;
-typedef cl_bitfield         cl_svm_mem_flags; // 2.0
-typedef cl_uint             cl_mem_object_type;
-typedef cl_uint             cl_mem_info;
-typedef cl_bitfield         cl_mem_migration_flags;
-typedef cl_uint             cl_image_info;
-typedef cl_uint             cl_buffer_create_type;
-typedef cl_uint             cl_addressing_mode;
-typedef cl_uint             cl_filter_mode;
-typedef cl_uint             cl_sampler_info;
-typedef cl_bitfield         cl_map_flags;
-typedef intptr_t            cl_pipe_properties; // 2.0
-typedef cl_uint             cl_pipe_info; // 2.0
-typedef cl_uint             cl_program_info;
-typedef cl_uint             cl_program_build_info;
-typedef cl_uint             cl_program_binary_type;
-typedef cl_int              cl_build_status;
-typedef cl_uint             cl_kernel_info;
-typedef cl_uint             cl_kernel_arg_info;
-typedef cl_uint             cl_kernel_arg_address_qualifier;
-typedef cl_uint             cl_kernel_arg_access_qualifier;
-typedef cl_bitfield         cl_kernel_arg_type_qualifier;
-typedef cl_uint             cl_kernel_work_group_info;
-typedef cl_uint             cl_event_info;
-typedef cl_uint             cl_command_type;
-typedef cl_uint             cl_profiling_info;
-typedef cl_bitfield         cl_sampler_properties; // 2.0
-typedef cl_uint             cl_kernel_exec_info; // 2.0
-
-typedef struct _cl_image_format {
-    cl_channel_order        image_channel_order;
-    cl_channel_type         image_channel_data_type;
-} cl_image_format;
-
-typedef struct _cl_image_desc {
-    cl_mem_object_type      image_type;
-    size_t                  image_width;
-    size_t                  image_height;
-    size_t                  image_depth;
-    size_t                  image_array_size;
-    size_t                  image_row_pitch;
-    size_t                  image_slice_pitch;
-    cl_uint                 num_mip_levels;
-    cl_uint                 num_samples;
-    cl_mem                  buffer;
-} cl_image_desc;
-
-typedef struct _cl_buffer_region {
-    size_t                  origin;
-    size_t                  size;
-} cl_buffer_region;
-
-/* cl_ext.h */
-
-typedef union
-{
-    struct { cl_uint type; cl_uint data[5]; } raw;
-    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
-} cl_device_topology_amd;
-
-/*
-typedef cl_ulong  cl_device_partition_property_ext;
-typedef cl_uint   cl_image_pitch_info_qcom;
-typedef struct _cl_mem_ext_host_ptr {
-    cl_uint  allocation_type;
-    cl_uint  host_cache_policy;
-} cl_mem_ext_host_ptr;
-typedef struct _cl_mem_ion_host_ptr {
-    cl_mem_ext_host_ptr  ext_host_ptr;
-    int                  ion_filedesc;
-    void*                ion_hostptr;
-} cl_mem_ion_host_ptr;
-
-typedef cl_bitfield         cl_mem_migration_flags_ext;
-*/
diff --git a/doc/index.rst b/doc/index.rst
index f771ab25332ba378c42ae5b403e47e267009af2d..d715d1a2e945f2e7d9e717bfd5add37c244a093a 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -108,6 +108,7 @@ Contents
     runtime_program
     runtime_gl
     array
+    types
     algorithm
     howto
     tools
diff --git a/doc/make_constants.py b/doc/make_constants.py
index c9de4cd87fef509b0bbd5c6ef7e2036af77eb6c4..9ab78ad070ec6d0cc419458335a75ed44f9c9a16 100644
--- a/doc/make_constants.py
+++ b/doc/make_constants.py
@@ -335,7 +335,6 @@ const_ext_lookup = {
             "WRITE_BUFFER_RECT": cl_11,
             "COPY_BUFFER_RECT": cl_11,
             "USER": cl_11,
-            "MIGRATE_MEM_OBJECT_EXT": ("cl_ext_migrate_memobject", "2011.2"),
             "BARRIER": cl_12,
             "MIGRATE_MEM_OBJECTS": cl_12,
             "FILL_BUFFER": cl_12,
@@ -408,9 +407,6 @@ const_ext_lookup = {
             "CONTENT_UNDEFINED": cl_12,
             },
 
-        cl.migrate_mem_object_flags_ext: {
-            "HOST": ("cl_ext_migrate_memobject", "2011.2"),
-            },
         }
 try:
     gl_ci = cl.gl_context_info
diff --git a/doc/misc.rst b/doc/misc.rst
index 7459f89f525cd9d178f3b3814aac0fc3f9ddd527..b4e30db755258400525464fdfd4b88d2e082582d 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -121,6 +121,8 @@ checking `this file
 Note that the triple-quoted strings containing the source must start with
 `"""//CL// ..."""`.
 
+.. _ipython-integration:
+
 IPython integration
 -------------------
 
diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst
index 75b60253d3d5c267e4d70707a8714710de2eaae0..a4ad2d5f22b081baa123567f90044dd80089f813 100644
--- a/doc/runtime_memory.rst
+++ b/doc/runtime_memory.rst
@@ -43,15 +43,6 @@ Memory Migration
 
     Only available with CL 1.2.
 
-.. function:: enqueue_migrate_mem_object_ext(queue, mem_objects, flags=0, wait_for=None)
-
-    :param flags: from :class:`migrate_mem_object_flags_ext`
-
-    .. versionadded:: 2011.2
-
-    Only available with the `cl_ext_migrate_memobject`
-    extension.
-
 Buffer
 ------
 
@@ -361,13 +352,28 @@ Mapping Memory into Host Address Space
 Samplers
 --------
 
-.. class:: Sampler(context, normalized_coords, addressing_mode, filter_mode)
+.. class:: Sampler
+
+
+    .. method:: __init__(context, normalized_coords, addressing_mode, filter_mode)
+
+        *normalized_coords* is a :class:`bool` indicating whether
+        to use coordinates between 0 and 1 (*True*) or the texture's
+        natural pixel size (*False*).
+        See :class:`addressing_mode` and :class:`filter_mode` for possible
+        argument values.
+
+    .. method:: __init__(context, properties)
+
+        :arg properties: a sequence
+            of keys and values from :class:`sampler_properties` as accepted
+            by :c:func:`clCreateSamplerWithProperties` (see the OpenCL
+            spec for details). The trailing *0* is added automatically
+            and does not need to be included.
+
+        Requires OpenCL 2 or newer.
 
-    *normalized_coords* is a :class:`bool` indicating whether
-    to use coordinates between 0 and 1 (*True*) or the texture's
-    natural pixel size (*False*).
-    See :class:`addressing_mode` and :class:`filter_mode` for possible
-    argument values.
+        .. versionadded:: 2018.2
 
     .. attribute:: info
 
diff --git a/doc/runtime_queue.rst b/doc/runtime_queue.rst
index b4567953c889acdaafd9c49bc8d04d2de745bbc5..c0b42897d151cd95c1289665e4f5a00d801fc078 100644
--- a/doc/runtime_queue.rst
+++ b/doc/runtime_queue.rst
@@ -13,20 +13,35 @@ Command Queue
     Create a new command queue. *properties* is a bit field
     consisting of :class:`command_queue_properties` values.
 
-    if *device* is None, one of the devices in *context* is chosen
+    If *device* is None, one of the devices in *context* is chosen
     in an implementation-defined manner.
 
+    *properties* may be a bitwise combination of values from
+    :class:`command_queue_properties` (or *None* which is equivalent to
+    passing *0*). This is compatible with both OpenCL 1.x and 2.x.
+
+    For OpenCL 2.0 and above, *properties* may also be a sequence
+    of keys and values from :class:`queue_properties` as accepted
+    by :c:func:`clCreateCommandQueueWithProperties` (see the OpenCL
+    spec for details). The trailing *0* is added automatically
+    and does not need to be included.
+
     A :class:`CommandQueue` may be used as a context manager, like this::
 
         with cl.CommandQueue(self.cl_context) as queue:
             enqueue_stuff(queue, ...)
 
-    :meth:`finish` is automatically called at the end of the context.
+    :meth:`finish` is automatically called at the end of the ``with``-delimited
+    context.
 
     .. versionadded:: 2013.1
 
         Context manager capability.
 
+    .. versionchanged:: 2018.2
+
+        Added the sequence-of-properties interface for OpenCL 2.
+
     .. attribute:: info
 
         Lower case versions of the :class:`command_queue_info` constants
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 539349755d4a939baa858ac2ca0c0449f8a8d0e1..d0904c29f54e66cdb403233fd0c984c685d99c5f 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -26,7 +26,7 @@ THE SOFTWARE.
 
 import re
 import six
-from six.moves import input
+from six.moves import input, intern
 
 from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT  # noqa
 
@@ -34,7 +34,7 @@ import logging
 logger = logging.getLogger(__name__)
 
 try:
-    import pyopencl.cffi_cl as _cl
+    import pyopencl._cl as _cl
 except ImportError:
     import os
     from os.path import dirname, join, realpath
@@ -46,7 +46,12 @@ except ImportError:
 
 import numpy as np
 
-from pyopencl.cffi_cl import (  # noqa
+import sys
+
+_PYPY = '__pypy__' in sys.builtin_module_names
+_CPY2 = not _PYPY and sys.version_info < (3,)
+
+from pyopencl._cl import (  # noqa
         get_cl_header_version,
         program_kind,
         status_code,
@@ -94,18 +99,13 @@ from pyopencl.cffi_cl import (  # noqa
         command_execution_status,
         profiling_info,
         mem_migration_flags,
-        mem_migration_flags_ext,
         device_partition_property,
         device_affinity_domain,
-        gl_object_type,
-        gl_texture_info,
-        migrate_mem_object_flags_ext,
 
         Error, MemoryError, LogicError, RuntimeError,
 
         Platform,
         get_platforms,
-        unload_platform_compiler,
 
         Device,
         Context,
@@ -115,30 +115,18 @@ from pyopencl.cffi_cl import (  # noqa
         MemoryObject,
         MemoryMap,
         Buffer,
-        SVMAllocation,
-        SVM,
-        SVMMap,
 
-        CompilerWarning,
         _Program,
         Kernel,
 
         Event,
         wait_for_events,
         NannyEvent,
-        UserEvent,
 
         enqueue_nd_range_kernel,
-        enqueue_task,
 
-        _enqueue_marker_with_wait_list,
         _enqueue_marker,
-        _enqueue_barrier_with_wait_list,
-
-        enqueue_migrate_mem_objects,
-        enqueue_migrate_mem_object_ext,
 
-        _enqueue_barrier_with_wait_list,
         _enqueue_read_buffer,
         _enqueue_write_buffer,
         _enqueue_copy_buffer,
@@ -146,49 +134,106 @@ from pyopencl.cffi_cl import (  # noqa
         _enqueue_write_buffer_rect,
         _enqueue_copy_buffer_rect,
 
-        enqueue_map_buffer,
-        _enqueue_fill_buffer,
         _enqueue_read_image,
         _enqueue_copy_image,
         _enqueue_write_image,
-        enqueue_map_image,
-        enqueue_fill_image,
         _enqueue_copy_image_to_buffer,
         _enqueue_copy_buffer_to_image,
-        enqueue_svm_memfill,
-        enqueue_svm_migratemem,
 
         have_gl,
-        _GLObject,
-        GLBuffer,
-        GLRenderBuffer,
 
         ImageFormat,
         get_supported_image_formats,
 
-        ImageDescriptor,
         Image,
         Sampler,
-        GLTexture,
         DeviceTopologyAmd,
+        )
 
-        add_get_info_attrs as _add_get_info_attrs,
+if not _PYPY:
+    # FIXME: Add back to default set when pypy support catches up
+    from pyopencl._cl import (  # noqa
+        enqueue_map_buffer,
+        enqueue_map_image,
+        )
+
+if get_cl_header_version() >= (1, 1):  # UserEvent is gated on header >= 1.1
+    from pyopencl._cl import (  # noqa
+        UserEvent,
+        )
+if get_cl_header_version() >= (1, 2):
+    from pyopencl._cl import (  # noqa
+        _enqueue_marker_with_wait_list,
+        _enqueue_barrier_with_wait_list,
+
+        unload_platform_compiler,
+
+
+        enqueue_migrate_mem_objects,
+        _enqueue_fill_buffer,
+        enqueue_fill_image,
+
+        ImageDescriptor,
+        )
+
+if get_cl_header_version() >= (2, 0):
+    from pyopencl._cl import (  # noqa
+        SVMAllocation,
+        SVM,
+
+        # FIXME
+        #enqueue_svm_migratemem,
         )
 
 if _cl.have_gl():
+    from pyopencl._cl import (  # noqa
+        gl_object_type,
+        gl_texture_info,
+
+        GLBuffer,
+        GLRenderBuffer,
+        GLTexture,
+        )
+
     try:
-        from pyopencl.cffi_cl import get_apple_cgl_share_group  # noqa
+        from pyopencl._cl import get_apple_cgl_share_group  # noqa
     except ImportError:
         pass
 
     try:
-        from pyopencl.cffi_cl import (  # noqa
+        from pyopencl._cl import (  # noqa
             enqueue_acquire_gl_objects,
             enqueue_release_gl_objects,
         )
     except ImportError:
         pass
 
+import inspect as _inspect
+
+CONSTANT_CLASSES = tuple(
+        getattr(_cl, name) for name in dir(_cl)
+        if _inspect.isclass(getattr(_cl, name))
+        and name[0].islower() and name not in ["zip", "map", "range"])
+
+
+# {{{ diagnostics
+
+class CompilerWarning(UserWarning):
+    """Warning category used by :func:`compiler_output`."""
+
+
+def compiler_output(text):
+    """Warn with *text* if PYOPENCL_COMPILER_OUTPUT is nonzero, else a hint."""
+    import os
+    import warnings
+    verbose = int(os.environ.get("PYOPENCL_COMPILER_OUTPUT", "0"))
+    hint = ("Non-empty compiler output encountered. Set the "
+            "environment variable PYOPENCL_COMPILER_OUTPUT=1 "
+            "to see more.")
+    warnings.warn(text if verbose else hint, CompilerWarning)
+
+# }}}
+
 
 # {{{ find pyopencl shipped source code
 
@@ -310,8 +355,8 @@ class Program(object):
         return self._get_prg().int_ptr
     int_ptr = property(int_ptr, doc=_cl._Program.int_ptr.__doc__)
 
-    def from_int_ptr(int_ptr_value):
-        return Program(_cl._Program.from_int_ptr(int_ptr_value))
+    def from_int_ptr(int_ptr_value, retain=True):
+        return Program(_cl._Program.from_int_ptr(int_ptr_value, retain))
     from_int_ptr.__doc__ = _cl._Program.from_int_ptr.__doc__
     from_int_ptr = staticmethod(from_int_ptr)
 
@@ -478,7 +523,7 @@ class Program(object):
         try:
             return build_func()
         except _cl.RuntimeError as e:
-            msg = e.what
+            msg = str(e)
             if options_bytes:
                 msg = msg + "\n(options: %s)" % options_bytes.decode("utf-8")
 
@@ -496,7 +541,7 @@ class Program(object):
             routine = e.routine
 
             err = _cl.RuntimeError(
-                    _cl.Error._ErrorRecord(
+                    _cl._ErrorRecord(
                         msg=msg,
                         code=code,
                         routine=routine))
@@ -522,9 +567,6 @@ class Program(object):
         return hash(self._get_prg())
 
 
-_add_get_info_attrs(Program, Program.get_info, program_info)
-
-
 def create_program_with_built_in_kernels(context, devices, kernel_names):
     if not isinstance(kernel_names, str):
         kernel_names = ":".join(kernel_names)
@@ -540,9 +582,750 @@ def link_program(context, programs, options=[], devices=None):
 # }}}
 
 
+# {{{ monkeypatch C++ wrappers to add functionality
+
+def _add_functionality():
+    def generic_get_cl_version(self):
+        import re
+        version_string = self.version
+        match = re.match(r"^OpenCL ([0-9]+)\.([0-9]+) .*$", version_string)
+        if match is None:
+            raise RuntimeError("%s %s returned non-conformant "
+                               "platform version string '%s'" %
+                               (type(self).__name__, self, version_string))
+
+        return int(match.group(1)), int(match.group(2))
+
+    # {{{ Platform
+
+    def platform_repr(self):
+        return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.int_ptr)
+
+    Platform.__repr__ = platform_repr
+    Platform._get_cl_version = generic_get_cl_version
+
+    # }}}
+
+    # {{{ Device
+
+    def device_repr(self):
+        return "<pyopencl.Device '%s' on '%s' at 0x%x>" % (
+                self.name.strip(), self.platform.name.strip(), self.int_ptr)
+
+    def device_persistent_unique_id(self):
+        return (self.vendor, self.vendor_id, self.name, self.version)
+
+    Device.__repr__ = device_repr
+
+    # undocumented for now:
+    Device._get_cl_version = generic_get_cl_version
+    Device.persistent_unique_id = property(device_persistent_unique_id)
+
+    # }}}
+
+    # {{{ Context
+
+    context_old_init = Context.__init__
+
+    def context_init(self, devices, properties, dev_type, cache_dir=None):
+        if cache_dir is not None:
+            from warnings import warn
+            warn("The 'cache_dir' argument to the Context constructor "
+                "is deprecated and no longer has an effect. "
+                "It was removed because it only applied to the wrapper "
+                "object and not the context itself, leading to inconsistencies.",
+                DeprecationWarning, stacklevel=2)
+
+        context_old_init(self, devices, properties, dev_type)
+
+    def context_repr(self):
+        return "<pyopencl.Context at 0x%x on %s>" % (self.int_ptr,
+                ", ".join(repr(dev) for dev in self.devices))
+
+    def context_get_cl_version(self):
+        return self.devices[0].platform._get_cl_version()
+
+    Context.__repr__ = context_repr
+    from pytools import memoize_method
+    Context._get_cl_version = memoize_method(context_get_cl_version)
+
+    # }}}
+
+    # {{{ CommandQueue
+
+    def command_queue_enter(self):
+        return self
+
+    def command_queue_exit(self, exc_type, exc_val, exc_tb):
+        self.finish()
+
+    def command_queue_get_cl_version(self):
+        return self.context._get_cl_version()
+
+    CommandQueue.__enter__ = command_queue_enter
+    CommandQueue.__exit__ = command_queue_exit
+    CommandQueue._get_cl_version = memoize_method(command_queue_get_cl_version)
+
+    # }}}
+
+    # {{{ _Program (the internal, non-caching version)
+
+    def program_get_build_logs(self):
+        build_logs = []
+        for dev in self.get_info(_cl.program_info.DEVICES):
+            try:
+                log = self.get_build_info(dev, program_build_info.LOG)
+            except Exception:
+                log = "<error retrieving log>"
+
+            build_logs.append((dev, log))
+
+        return build_logs
+
+    def program_build(self, options_bytes, devices=None):
+        err = None
+        try:
+            self._build(options=options_bytes, devices=devices)
+        except Error as e:
+            msg = str(e) + "\n\n" + (75*"="+"\n").join(
+                    "Build on %s:\n\n%s" % (dev, log)
+                    for dev, log in self._get_build_logs())
+            code = e.code
+            routine = e.routine
+
+            err = _cl.RuntimeError(
+                    _cl._ErrorRecord(
+                        msg=msg,
+                        code=code,
+                        routine=routine))
+
+        if err is not None:
+            # Python 3.2 outputs the whole list of currently active exceptions
+            # This serves to remove one (redundant) level from that nesting.
+            raise err
+
+        message = (75*"="+"\n").join(
+                "Build on %s succeeded, but said:\n\n%s" % (dev, log)
+                for dev, log in self._get_build_logs()
+                if log is not None and log.strip())
+
+        if message:
+            if self.kind() == program_kind.SOURCE:
+                build_type = "From-source build"
+            elif self.kind() == program_kind.BINARY:
+                build_type = "From-binary build"
+            else:
+                build_type = "Build"
+
+            compiler_output("%s succeeded, but resulted in non-empty logs:\n%s"
+                    % (build_type, message))
+
+        return self
+
+    _cl._Program._get_build_logs = program_get_build_logs
+    _cl._Program.build = program_build
+
+    # }}}
+
+    # {{{ Event
+    class ProfilingInfoGetter:
+        def __init__(self, event):
+            self.event = event
+
+        def __getattr__(self, name):
+            info_cls = _cl.profiling_info
+
+            try:
+                inf_attr = getattr(info_cls, name.upper())
+            except AttributeError:
+                raise AttributeError("%s has no attribute '%s'"
+                        % (type(self), name))
+            else:
+                return self.event.get_profiling_info(inf_attr)
+
+    _cl.Event.profile = property(ProfilingInfoGetter)
+
+    # }}}
+
+    # {{{ Kernel
+
+    kernel_old_init = Kernel.__init__
+    kernel_old_get_info = Kernel.get_info
+    kernel_old_get_work_group_info = Kernel.get_work_group_info
+
+    def kernel_init(self, prg, name):
+        if not isinstance(prg, _cl._Program):
+            prg = prg._get_prg()
+
+        kernel_old_init(self, prg, name)
+
+        self._setup(prg)
+
+    def kernel__setup(self, prg):
+        self._source = getattr(prg, "_source", None)
+
+        from pyopencl.invoker import generate_enqueue_and_set_args
+        self._enqueue, self._set_args = generate_enqueue_and_set_args(
+                self.function_name, self.num_args, self.num_args,
+                None,
+                warn_about_arg_count_bug=None,
+                work_around_arg_count_bug=None)
+
+        self._wg_info_cache = {}
+        return self
+
+    def kernel_set_scalar_arg_dtypes(self, scalar_arg_dtypes):
+        self._scalar_arg_dtypes = tuple(scalar_arg_dtypes)
+
+        # {{{ arg counting bug handling
+
+        # For example:
+        # https://github.com/pocl/pocl/issues/197
+        # (but Apple CPU has a similar bug)
+
+        work_around_arg_count_bug = False
+        warn_about_arg_count_bug = False
+
+        from pyopencl.characterize import has_struct_arg_count_bug
+
+        count_bug_per_dev = [
+                has_struct_arg_count_bug(dev, self.context)
+                for dev in self.context.devices]
+
+        from pytools import single_valued
+        if any(count_bug_per_dev):
+            if all(count_bug_per_dev):
+                work_around_arg_count_bug = single_valued(count_bug_per_dev)
+            else:
+                warn_about_arg_count_bug = True
+
+        # }}}
+
+        from pyopencl.invoker import generate_enqueue_and_set_args
+        self._enqueue, self._set_args = generate_enqueue_and_set_args(
+                self.function_name,
+                len(self._scalar_arg_dtypes), self.num_args,  # sized tuple
+                self._scalar_arg_dtypes,
+                warn_about_arg_count_bug=warn_about_arg_count_bug,
+                work_around_arg_count_bug=work_around_arg_count_bug)
+
+    def kernel_get_work_group_info(self, param, device):
+        try:
+            return self._wg_info_cache[param, device]
+        except KeyError:
+            pass
+
+        result = kernel_old_get_work_group_info(self, param, device)
+        self._wg_info_cache[param, device] = result
+        return result
+
+    def kernel_set_args(self, *args, **kwargs):
+        # Need to duplicate the 'self' argument for dynamically generated method
+        return self._set_args(self, *args, **kwargs)
+
+    def kernel_call(self, queue, global_size, local_size, *args, **kwargs):
+        # __call__ can't be overridden directly, so we need this
+        # trampoline hack.
+        return self._enqueue(self, queue, global_size, local_size, *args, **kwargs)
+
+    def kernel_capture_call(self, filename, queue, global_size, local_size,
+            *args, **kwargs):
+        from pyopencl.capture_call import capture_kernel_call
+        capture_kernel_call(self, filename, queue, global_size, local_size,
+                *args, **kwargs)
+
+    def kernel_get_info(self, param_name):
+        val = kernel_old_get_info(self, param_name)
+
+        if isinstance(val, _Program):
+            return Program(val)
+        else:
+            return val
+
+    Kernel.__init__ = kernel_init
+    Kernel._setup = kernel__setup
+    Kernel.get_work_group_info = kernel_get_work_group_info
+    Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes
+    Kernel.set_args = kernel_set_args
+    Kernel.__call__ = kernel_call
+    Kernel.capture_call = kernel_capture_call
+    Kernel.get_info = kernel_get_info
+
+    # }}}
+
+    # {{{ ImageFormat
+
+    def image_format_repr(self):
+        return "ImageFormat(%s, %s)" % (
+                channel_order.to_string(self.channel_order,
+                    "<unknown channel order 0x%x>"),
+                channel_type.to_string(self.channel_data_type,
+                    "<unknown channel data type 0x%x>"))
+
+    def image_format_eq(self, other):
+        return (self.channel_order == other.channel_order
+                and self.channel_data_type == other.channel_data_type)
+
+    def image_format_ne(self, other):
+        return not image_format_eq(self, other)
+
+    def image_format_hash(self):
+        return hash((type(self), self.channel_order, self.channel_data_type))
+
+    ImageFormat.__repr__ = image_format_repr
+    ImageFormat.__eq__ = image_format_eq
+    ImageFormat.__ne__ = image_format_ne
+    ImageFormat.__hash__ = image_format_hash
+
+    # }}}
+
+    # {{{ Image
+
+    image_old_init = Image.__init__
+
+    def image_init(self, context, flags, format, shape=None, pitches=None,
+            hostbuf=None, is_array=False, buffer=None):
+
+        if shape is None and hostbuf is None:
+            raise Error("'shape' must be passed if 'hostbuf' is not given")
+
+        if shape is None and hostbuf is not None:
+            shape = hostbuf.shape
+
+        if hostbuf is not None and not \
+                (flags & (mem_flags.USE_HOST_PTR | mem_flags.COPY_HOST_PTR)):
+            from warnings import warn
+            warn("'hostbuf' was passed, but no memory flags to make use of it.")
+
+        if hostbuf is None and pitches is not None:
+            raise Error("'pitches' may only be given if 'hostbuf' is given")
+
+        if context._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2):
+            if buffer is not None and is_array:
+                    raise ValueError(
+                            "'buffer' and 'is_array' are mutually exclusive")
+
+            if len(shape) == 3:
+                if buffer is not None:
+                    raise TypeError(
+                            "'buffer' argument is not supported for 3D arrays")
+                elif is_array:
+                    image_type = mem_object_type.IMAGE2D_ARRAY
+                else:
+                    image_type = mem_object_type.IMAGE3D
+
+            elif len(shape) == 2:
+                if buffer is not None:
+                    raise TypeError(
+                            "'buffer' argument is not supported for 2D arrays")
+                elif is_array:
+                    image_type = mem_object_type.IMAGE1D_ARRAY
+                else:
+                    image_type = mem_object_type.IMAGE2D
+
+            elif len(shape) == 1:
+                if buffer is not None:
+                    image_type = mem_object_type.IMAGE1D_BUFFER
+                elif is_array:
+                    raise TypeError("array of zero-dimensional images not supported")
+                else:
+                    image_type = mem_object_type.IMAGE1D
+
+            else:
+                raise ValueError("images cannot have more than three dimensions")
+
+            desc = ImageDescriptor()
+
+            desc.image_type = image_type
+            desc.shape = shape  # also sets desc.array_size
+
+            if pitches is None:
+                desc.pitches = (0, 0)
+            else:
+                desc.pitches = pitches
+
+            desc.num_mip_levels = 0  # per CL 1.2 spec
+            desc.num_samples = 0  # per CL 1.2 spec
+            desc.buffer = buffer
+
+            image_old_init(self, context, flags, format, desc, hostbuf)
+        else:
+            # legacy init for CL 1.1 and older
+            if is_array:
+                raise TypeError("'is_array=True' is not supported for CL < 1.2")
+            # if num_mip_levels is not None:
+                # raise TypeError(
+                #       "'num_mip_levels' argument is not supported for CL < 1.2")
+            # if num_samples is not None:
+                # raise TypeError(
+                #        "'num_samples' argument is not supported for CL < 1.2")
+            if buffer is not None:
+                raise TypeError("'buffer' argument is not supported for CL < 1.2")
+
+            image_old_init(self, context, flags, format, shape,
+                    pitches, hostbuf)
+
+    class _ImageInfoGetter:
+        def __init__(self, event):
+            from warnings import warn
+            warn("Image.image.attr is deprecated. "
+                    "Use Image.attr directly, instead.")
+
+            self.event = event
+
+        def __getattr__(self, name):
+            try:
+                inf_attr = getattr(_cl.image_info, name.upper())
+            except AttributeError:
+                raise AttributeError("%s has no attribute '%s'"
+                        % (type(self), name))
+            else:
+                return self.event.get_image_info(inf_attr)
+
+    def image_shape(self):
+        if self.type == mem_object_type.IMAGE2D:
+            return (self.width, self.height)
+        elif self.type == mem_object_type.IMAGE3D:
+            return (self.width, self.height, self.depth)
+        else:
+            raise LogicError("only images have shapes")
+
+    Image.__init__ = image_init
+    Image.image = property(_ImageInfoGetter)
+    Image.shape = property(image_shape)
+
+    # }}}
+
+    # {{{ Error
+
+    def error_str(self):
+        val = self.what
+        try:
+            val.routine
+        except AttributeError:
+            return str(val)
+        else:
+            result = ""
+            if val.code() != status_code.SUCCESS:
+                result = status_code.to_string(
+                        val.code(), "<unknown error %d>")
+            routine = val.routine()
+            if routine:
+                result = "%s failed: %s" % (routine, result)
+            what = val.what()
+            if what:
+                if result:
+                    result += " - "
+                result += what
+            return result
+
+    def error_code(self):
+        return self.args[0].code()
+
+    def error_routine(self):
+        return self.args[0].routine()
+
+    def error_what(self):
+        return self.args[0]
+
+    Error.__str__ = error_str
+    Error.code = property(error_code)
+    Error.routine = property(error_routine)
+    Error.what = property(error_what)
+
+    # }}}
+
+    # {{{ MemoryMap
+
+    def memory_map_enter(self):
+        return self
+
+    def memory_map_exit(self, exc_type, exc_val, exc_tb):
+        self.release()
+
+    MemoryMap.__doc__ = """
+        This class may also be used as a context manager in a ``with`` statement.
+        The memory corresponding to this object will be unmapped when
+        this object is deleted or :meth:`release` is called.
+
+        .. automethod:: release
+        """
+    MemoryMap.__enter__ = memory_map_enter
+    MemoryMap.__exit__ = memory_map_exit
+
+    # }}}
+
+    # {{{ SVMAllocation
+
+    if get_cl_header_version() >= (2, 0):
+        SVMAllocation.__doc__ = """An object whose lifetime is tied to an allocation of shared virtual memory.
+
+            .. note::
+
+                Most likely, you will not want to use this directly, but rather
+                :func:`svm_empty` and related functions which allow access to this
+                functionality using a friendlier, more Pythonic interface.
+
+            .. versionadded:: 2016.2
+
+            .. automethod:: __init__(self, ctx, size, alignment, flags=None)
+            .. automethod:: release
+            .. automethod:: enqueue_release
+            """
+
+    if get_cl_header_version() >= (2, 0):
+        svmallocation_old_init = SVMAllocation.__init__
+
+    def svmallocation_init(self, ctx, size, alignment, flags, _interface=None):
+        """
+        :arg ctx: a :class:`Context`
+        :arg flags: some of :class:`svm_mem_flags`.
+        """
+        svmallocation_old_init(self, ctx, size, alignment, flags)
+
+        read_write = (
+                flags & mem_flags.WRITE_ONLY != 0
+                or flags & mem_flags.READ_WRITE != 0)
+
+        _interface["data"] = (
+                int(self._ptr_as_int()), not read_write)
+
+        self.__array_interface__ = _interface
+
+    if get_cl_header_version() >= (2, 0):
+        SVMAllocation.__init__ = svmallocation_init
+
+    # }}}
+
+    # {{{ SVM
+
+    if get_cl_header_version() >= (2, 0):
+        SVM.__doc__ = """Tags an object exhibiting the Python buffer interface (such as a
+            :class:`numpy.ndarray`) as referring to shared virtual memory.
+
+            Depending on the features of the OpenCL implementation, the following
+            types of objects may be passed to/wrapped in this type:
+
+            *   coarse-grain shared memory as returned by (e.g.) :func:`csvm_empty`
+                for any implementation of OpenCL 2.0.
+
+                This is how coarse-grain SVM may be used from both host and device::
+
+                    svm_ary = cl.SVM(
+                        cl.csvm_empty(ctx, 1000, np.float32, alignment=64))
+                    assert isinstance(svm_ary.mem, np.ndarray)
+
+                    with svm_ary.map_rw(queue) as ary:
+                        ary.fill(17)  # use from host
+
+                    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)
+
+            *   fine-grain shared memory as returned by (e.g.) :func:`fsvm_empty`,
+                if the implementation supports fine-grained shared virtual memory.
+                This memory may directly be passed to a kernel::
+
+                    ary = cl.fsvm_empty(ctx, 1000, np.float32)
+                    assert isinstance(ary, np.ndarray)
+
+                    prg.twice(queue, ary.shape, None, cl.SVM(ary))
+                    queue.finish() # synchronize
+                    print(ary) # access from host
+
+                Observe how mapping (as needed in coarse-grain SVM) is no longer
+                necessary.
+
+            *   any :class:`numpy.ndarray` (or other Python object with a buffer
+                interface) if the implementation supports fine-grained *system*
+                shared virtual memory.
+
+                This is how plain :mod:`numpy` arrays may directly be passed to a
+                kernel::
+
+                    ary = np.zeros(1000, np.float32)
+                    prg.twice(queue, ary.shape, None, cl.SVM(ary))
+                    queue.finish() # synchronize
+                    print(ary) # access from host
+
+            Objects of this type may be passed to kernel calls and
+            :func:`enqueue_copy`.  Coarse-grain shared-memory *must* be mapped
+            into host address space using :meth:`map` before being accessed
+            through the :mod:`numpy` interface.
+
+            .. note::
+
+                This object merely serves as a 'tag' that changes the behavior
+                of functions to which it is passed. It has no special management
+                relationship to the memory it tags. For example, it is permissible
+                to grab a :mod:`numpy.array` out of :attr:`SVM.mem` of one
+                :class:`SVM` instance and use the array to construct another.
+                Neither of the tags need to be kept alive.
+
+            .. versionadded:: 2016.2
+
+            .. attribute:: mem
+
+                The wrapped object.
+
+            .. automethod:: __init__
+            .. automethod:: map
+            .. automethod:: map_ro
+            .. automethod:: map_rw
+            .. automethod:: as_buffer
+            """
+
+    if get_cl_header_version() >= (2, 0):
+        svm_old_init = SVM.__init__
+
+    def svm_init(self, mem):
+        svm_old_init(self, mem)
+
+        self.mem = mem
+
+    def svm_map(self, queue, flags, is_blocking=True, wait_for=None):
+        """
+        :arg is_blocking: If *False*, subsequent code must wait on
+            :attr:`SVMMap.event` in the returned object before accessing the
+            mapped memory.
+        :arg flags: a combination of :class:`pyopencl.map_flags`. Required;
+            see :meth:`map_ro` and :meth:`map_rw` for preset values.
+        :returns: an :class:`SVMMap` instance
+
+        |std-enqueue-blurb|
+        """
+        return SVMMap(
+                self,
+                queue,
+                _cl._enqueue_svm_map(queue, is_blocking, flags, self, wait_for))
+
+    def svm_map_ro(self, queue, is_blocking=True, wait_for=None):
+        """Like :meth:`map`, but with *flags* set for a read-only map."""
+
+        return self.map(queue, map_flags.READ,
+                is_blocking=is_blocking, wait_for=wait_for)
+
+    def svm_map_rw(self, queue, is_blocking=True, wait_for=None):
+        """Like :meth:`map`, but with *flags* set for a read-write map."""
+
+        return self.map(queue, map_flags.READ | map_flags.WRITE,
+                is_blocking=is_blocking, wait_for=wait_for)
+
+    def svm__enqueue_unmap(self, queue, wait_for=None):
+        return _cl._enqueue_svm_unmap(queue, self, wait_for)
+
+    def svm_as_buffer(self, ctx, flags=None):
+        """
+        :arg ctx: a :class:`Context`
+        :arg flags: a combination of :class:`pyopencl.mem_flags`, defaults to
+            read-write.
+        :returns: a :class:`Buffer` corresponding to *self*.
+
+        The memory referred to by this object must not be freed before
+        the returned :class:`Buffer` is released.
+        """
+
+        if flags is None:
+            flags = mem_flags.READ_WRITE
+
+        return Buffer(ctx, flags, size=self.mem.nbytes, hostbuf=self.mem)
+
+    if get_cl_header_version() >= (2, 0):
+        SVM.__init__ = svm_init
+        SVM.map = svm_map
+        SVM.map_ro = svm_map_ro
+        SVM.map_rw = svm_map_rw
+        SVM._enqueue_unmap = svm__enqueue_unmap
+        SVM.as_buffer = svm_as_buffer
+
+    # }}}
+
+    # ORDER DEPENDENCY: Some of the above may override get_info, the effect needs
+    # to be visible through the attributes. So get_info attr creation needs to happen
+    # after the overriding is complete.
+    cls_to_info_cls = {
+            _cl.Platform: (_cl.Platform.get_info, _cl.platform_info, []),
+            _cl.Device: (_cl.Device.get_info, _cl.device_info,
+                ["PLATFORM", "MAX_WORK_GROUP_SIZE", "MAX_COMPUTE_UNITS"]),
+            _cl.Context: (_cl.Context.get_info, _cl.context_info, []),
+            _cl.CommandQueue: (_cl.CommandQueue.get_info, _cl.command_queue_info,
+                ["CONTEXT", "DEVICE"]),
+            _cl.Event: (_cl.Event.get_info, _cl.event_info, []),
+            _cl.MemoryObjectHolder:
+            (MemoryObjectHolder.get_info, _cl.mem_info, []),
+            Image: (_cl.Image.get_image_info, _cl.image_info, []),
+            Program: (Program.get_info, _cl.program_info, []),
+            Kernel: (Kernel.get_info, _cl.kernel_info, []),
+            _cl.Sampler: (Sampler.get_info, _cl.sampler_info, []),
+            }
+
+    def to_string(cls, value, default_format=None):
+        for name in dir(cls):
+            if (not name.startswith("_") and getattr(cls, name) == value):
+                return name
+
+        if default_format is None:
+            raise ValueError("a name for value %d was not found in %s"
+                    % (value, cls.__name__))
+        else:
+            return default_format % value
+
+    for cls in CONSTANT_CLASSES:
+        cls.to_string = classmethod(to_string)
+
+    # {{{ get_info attributes -------------------------------------------------
+
+    def make_getinfo(info_method, info_name, info_attr):
+        def result(self):
+            return info_method(self, info_attr)
+
+        return property(result)
+
+    def make_cacheable_getinfo(info_method, info_name, cache_attr, info_attr):
+        def result(self):
+            try:
+                return getattr(self, cache_attr)
+            except AttributeError:
+                pass
+
+            result = info_method(self, info_attr)
+            setattr(self, cache_attr, result)
+            return result
+
+        return property(result)
+
+    for cls, (info_method, info_class, cacheable_attrs) \
+            in six.iteritems(cls_to_info_cls):
+        for info_name, info_value in six.iteritems(info_class.__dict__):
+            if info_name == "to_string" or info_name.startswith("_"):
+                continue
+
+            info_lower = info_name.lower()
+            info_constant = getattr(info_class, info_name)
+            if info_name in cacheable_attrs:
+                cache_attr = intern("_info_cache_"+info_lower)
+                setattr(cls, info_lower, make_cacheable_getinfo(
+                    info_method, info_lower, cache_attr, info_constant))
+            else:
+                setattr(cls, info_lower, make_getinfo(
+                        info_method, info_name, info_constant))
+
+    # }}}
+
+    if _cl.have_gl():
+        def gl_object_get_gl_object(self):
+            return self.get_gl_object_info()[1]
+
+        GLBuffer.gl_object = property(gl_object_get_gl_object)
+        GLTexture.gl_object = property(gl_object_get_gl_object)
+
+
+_add_functionality()
+
+# }}}
+
+
 # {{{ create_some_context
 
-def create_some_context(interactive=None, answers=None, cache_dir=None):
+def create_some_context(interactive=None, answers=None):
     import os
     if answers is None:
         if "PYOPENCL_CTX" in os.environ:
@@ -553,7 +1336,7 @@ def create_some_context(interactive=None, answers=None, cache_dir=None):
             from pyopencl.tools import get_test_platforms_and_devices
             for plat, devs in get_test_platforms_and_devices():
                 for dev in devs:
-                    return Context([dev], cache_dir=cache_dir)
+                    return Context([dev])
 
     if answers is not None:
         pre_provided_answers = answers
@@ -668,7 +1451,7 @@ def create_some_context(interactive=None, answers=None, cache_dir=None):
         raise RuntimeError("not all provided choices were used by "
                 "create_some_context. (left over: '%s')" % ":".join(answers))
 
-    return Context(devices, cache_dir=cache_dir)
+    return Context(devices)
 
 
 _csc = create_some_context
@@ -676,46 +1459,54 @@ _csc = create_some_context
 # }}}
 
 
-# {{{ enqueue_copy
+# {{{ SVMMap
 
-def _mark_copy_deprecated(func):
-    def new_func(*args, **kwargs):
-        from warnings import warn
-        warn("'%s' has been deprecated in version 2011.1. Please use "
-                "enqueue_copy() instead." % func.__name__[1:], DeprecationWarning,
-                stacklevel=2)
-        return func(*args, **kwargs)
+class SVMMap(object):
+    """
+    .. attribute:: event
 
-    try:
-        from functools import update_wrapper
-    except ImportError:
-        pass
-    else:
-        try:
-            update_wrapper(new_func, func)
-        except AttributeError:
-            pass
+    .. versionadded:: 2016.2
+
+    .. automethod:: release
+
+    This class may also be used as a context manager in a ``with`` statement.
+    :meth:`release` will be called upon exit from the ``with`` region.
+    The value returned to the ``as`` part of the context manager is the
+    mapped Python object (e.g. a :mod:`numpy` array).
+    """
+    def __init__(self, svm, queue, event):
+        self.svm = svm
+        self.queue = queue
+        self.event = event
 
-    return new_func
+    def __del__(self):
+        if self.svm is not None:
+            self.release()
 
+    def __enter__(self):
+        return self.svm.mem
 
-enqueue_read_image = _mark_copy_deprecated(_cl._enqueue_read_image)
-enqueue_write_image = _mark_copy_deprecated(_cl._enqueue_write_image)
-enqueue_copy_image = _mark_copy_deprecated(_cl._enqueue_copy_image)
-enqueue_copy_image_to_buffer = _mark_copy_deprecated(
-        _cl._enqueue_copy_image_to_buffer)
-enqueue_copy_buffer_to_image = _mark_copy_deprecated(
-        _cl._enqueue_copy_buffer_to_image)
-enqueue_read_buffer = _mark_copy_deprecated(_cl._enqueue_read_buffer)
-enqueue_write_buffer = _mark_copy_deprecated(_cl._enqueue_write_buffer)
-enqueue_copy_buffer = _mark_copy_deprecated(_cl._enqueue_copy_buffer)
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.release()
 
+    def release(self, queue=None, wait_for=None):
+        """
+        :arg queue: a :class:`pyopencl.CommandQueue`. Defaults to the one
+            with which the map was created, if not specified.
+        :returns: a :class:`pyopencl.Event`
 
-if _cl.get_cl_header_version() >= (1, 1):
-    enqueue_read_buffer_rect = _mark_copy_deprecated(_cl._enqueue_read_buffer_rect)
-    enqueue_write_buffer_rect = _mark_copy_deprecated(_cl._enqueue_write_buffer_rect)
-    enqueue_copy_buffer_rect = _mark_copy_deprecated(_cl._enqueue_copy_buffer_rect)
+        |std-enqueue-blurb|
+        """
 
+        unmap_queue = self.queue if queue is None else queue  # honor override
+        evt = self.svm._enqueue_unmap(unmap_queue, wait_for)
+        self.svm = None
+        return evt
+
+# }}}
+
+
+# {{{ enqueue_copy
 
 def enqueue_copy(queue, dest, src, **kwargs):
     """Copy from :class:`Image`, :class:`Buffer` or the host to
@@ -891,12 +1682,13 @@ def enqueue_copy(queue, dest, src, **kwargs):
         else:
             raise ValueError("invalid dest mem object type")
 
-    elif isinstance(dest, SVM):
+    elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVM):
         # to SVM
         if isinstance(src, SVM):
             src = src.mem
 
         return _cl._enqueue_svm_memcpy(queue, dest.mem, src, **kwargs)
+
     else:
         # assume to-host
 
@@ -1039,6 +1831,10 @@ def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None):
         from warnings import warn
         warn("The context for this queue does not declare OpenCL 1.2 support, so "
                 "the next thing you might see is a crash")
+
+    if _PYPY and isinstance(pattern, np.generic):
+        pattern = np.asarray(pattern)
+
     return _cl._enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for)
 
 # }}}
@@ -1046,6 +1842,48 @@ def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None):
 
 # {{{ numpy-like svm allocation
 
+def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None):
+    """Fill shared virtual memory with a pattern.
+
+    :arg dest: a Python buffer object, optionally wrapped in an :class:`SVM` object
+    :arg pattern: a Python buffer object (e.g. a :class:`numpy.ndarray`) with the
+        fill pattern to be used.
+    :arg byte_count: The size of the memory to be filled. Defaults to the
+        entirety of *dest*.
+
+    |std-enqueue-blurb|
+
+    .. versionadded:: 2016.2
+    """
+
+    if not isinstance(dest, SVM):
+        dest = SVM(dest)
+    # byte_count and wait_for must be forwarded as given, not reset to None.
+    return _cl._enqueue_svm_memfill(
+            queue, dest, pattern, byte_count=byte_count, wait_for=wait_for)
+
+
+def enqueue_svm_migratemem(queue, svms, flags, wait_for=None):
+    """Enqueue a migration of the given shared virtual memory regions.
+
+    :arg svms: a collection of Python buffer objects (e.g. :mod:`numpy`
+        arrays), optionally wrapped in :class:`SVM` objects.
+    :arg flags: a combination of :class:`mem_migration_flags`
+
+    |std-enqueue-blurb|
+
+    .. versionadded:: 2016.2
+
+    This function requires OpenCL 2.1.
+    """
+
+    # Pass each SVM wrapper's ``.mem``; any other object goes through unchanged.
+    buffers = []
+    for entry in svms:
+        buffers.append(entry.mem if isinstance(entry, SVM) else entry)
+    return _cl._enqueue_svm_migratemem(queue, buffers, flags, wait_for)
+
+
 def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None):
     """Allocate an empty :class:`numpy.ndarray` of the given *shape*, *dtype*
     and *order*. (See :func:`numpy.empty` for the meaning of these arguments.)
@@ -1193,4 +2031,14 @@ def fsvm_empty_like(ctx, ary, alignment=None):
 
 # }}}
 
+
+# Tuple of classes collected under the name _KERNEL_ARG_CLASSES.
+# NOTE(review): presumably used for isinstance checks on kernel
+# arguments elsewhere -- confirm against the callers.
+_KERNEL_ARG_CLASSES = (MemoryObjectHolder, Sampler, LocalMemory)
+if get_cl_header_version() >= (2, 0):
+    # SVM is only appended when the CL header version is at least 2.0.
+    _KERNEL_ARG_CLASSES += (SVM,)
+
+
 # vim: foldmethod=marker
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 704c495b4bff7ceed3d4808244a8bdb844f267b5..a4a5f4cffa57a314192878aec926f99285954b78 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -663,7 +663,8 @@ class Array(object):
         if ary is None:
             ary = np.empty(self.shape, self.dtype)
 
-            ary = _as_strided(ary, strides=self.strides)
+            if self.strides != ary.strides:
+                ary = _as_strided(ary, strides=self.strides)
         else:
             if ary.size != self.size:
                 raise TypeError("'ary' has non-matching size")
diff --git a/pyopencl/cache.py b/pyopencl/cache.py
index 22e55c404a7a5d742f8e511f04308ed437acdf8c..48b6270edcdc107b1aa006b4202feb3e6a29b36f 100644
--- a/pyopencl/cache.py
+++ b/pyopencl/cache.py
@@ -26,7 +26,7 @@ THE SOFTWARE.
 
 import six
 from six.moves import zip
-import pyopencl.cffi_cl as _cl
+import pyopencl._cl as _cl
 import re
 import sys
 import os
@@ -374,7 +374,7 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
             if log is not None and log.strip())
 
     if message:
-        from pyopencl.cffi_cl import compiler_output
+        from pyopencl import compiler_output
         compiler_output(
                 "Built kernel retrieved from cache. Original from-source "
                 "build had warnings:\n"+message)
diff --git a/pyopencl/cffi_cl.py b/pyopencl/cffi_cl.py
deleted file mode 100644
index c5effc7b35dbd3e64b88b8eaa56016809797f147..0000000000000000000000000000000000000000
--- a/pyopencl/cffi_cl.py
+++ /dev/null
@@ -1,2954 +0,0 @@
-from __future__ import division, absolute_import
-
-__copyright__ = """
-Copyright (C) 2013 Marko Bencun
-Copyright (C) 2014 Andreas Kloeckner
-Copyright (C) 2014 Yichao Yu
-"""
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-import six
-from six.moves import map, range, zip, intern
-
-import warnings
-import numpy as np
-import sys
-import re
-
-from pytools import memoize_method
-
-from pyopencl._cffi import ffi as _ffi
-from .compyte.array import f_contiguous_strides, c_contiguous_strides
-
-
-from pyopencl._cffi import lib as _lib
-
-import logging
-logger = logging.getLogger(__name__)
-
-
-class _CLKernelArg(object):
-    pass
-
-
-# {{{ hook up connections between the wrapper and the interperter
-
-import gc
-_py_gc = _ffi.callback('int(void)')(gc.collect)
-
-_pyrefs = {}
-
-
-@_ffi.callback('void(void*)')
-def _py_deref(handle):
-    try:
-        del _pyrefs[handle]
-    except Exception:
-        pass
-
-
-# return a new reference of the object pointed to by the handle.
-# The return value might be different with the input (on PyPy).
-# _py_deref should be called (once) when the object is not needed anymore.
-@_ffi.callback('void*(void*)')
-def _py_ref(handle):
-    obj = _ffi.from_handle(handle)
-    handle = _ffi.new_handle(obj)
-    _pyrefs[handle] = handle
-    return handle
-
-
-@_ffi.callback('void(void*, cl_int)')
-def _py_call(handle, status):
-    _ffi.from_handle(handle)(status)
-
-
-_lib.set_py_funcs(_py_gc, _py_ref, _py_deref, _py_call)
-
-# }}}
-
-
-# {{{ compatibility shims
-
-# are we running on pypy?
-_PYPY = '__pypy__' in sys.builtin_module_names
-_CPY2 = not _PYPY and sys.version_info < (3,)
-
-try:
-    _unicode = eval('unicode')
-    _ffi_pystr = _ffi.string
-except Exception:
-    _unicode = str
-    _bytes = bytes
-
-    def _ffi_pystr(s):
-        return _ffi.string(s).decode() if s else None
-else:
-    try:
-        _bytes = bytes
-    except Exception:
-        _bytes = str
-
-
-def _to_cstring(s):
-    if isinstance(s, _unicode):
-        return s.encode()
-    return s
-
-# }}}
-
-
-# {{{ wrapper tools
-
-# {{{ _CArray helper classes
-
-class _CArray(object):
-    def __init__(self, ptr):
-        self.ptr = ptr
-        self.size = _ffi.new('uint32_t*')
-
-    def __del__(self):
-        if self.ptr != _ffi.NULL:
-            _lib.free_pointer(self.ptr[0])
-
-    def __getitem__(self, key):
-        return self.ptr[0].__getitem__(key)
-
-    def __iter__(self):
-        for i in range(self.size[0]):
-            yield self[i]
-
-# }}}
-
-
-# {{{ GetInfo support
-
-def _generic_info_to_python(info):
-    type_ = _ffi_pystr(info.type)
-
-    if info.free_type:
-        _lib.free_pointer(info.type)
-
-    value = _ffi.cast(type_, info.value)
-
-    if info.opaque_class != _lib.CLASS_NONE:
-        klass = {
-            _lib.CLASS_PLATFORM: Platform,
-            _lib.CLASS_DEVICE: Device,
-            _lib.CLASS_KERNEL: Kernel,
-            _lib.CLASS_CONTEXT: Context,
-            _lib.CLASS_BUFFER: Buffer,
-            _lib.CLASS_PROGRAM: _Program,
-            _lib.CLASS_EVENT: Event,
-            _lib.CLASS_COMMAND_QUEUE: CommandQueue
-            }[info.opaque_class]
-
-        if klass is _Program:
-            def create_inst(val):
-                from pyopencl import Program
-                return Program(_Program._create(val))
-
-        else:
-            create_inst = klass._create
-
-        if type_.endswith(']'):
-            ret = list(map(create_inst, value))
-            _lib.free_pointer(info.value)
-            return ret
-        else:
-            return create_inst(value)
-
-    if type_ == 'char*':
-        ret = _ffi_pystr(value)
-    elif type_ == 'cl_device_topology_amd*':
-        ret = DeviceTopologyAmd(
-                value.pcie.bus, value.pcie.device, value.pcie.function)
-    elif type_ == 'cl_image_format*':
-        ret = ImageFormat(value.image_channel_order,
-                               value.image_channel_data_type)
-    elif type_.startswith('char*['):
-        ret = list(map(_ffi_pystr, value))
-        _lib.free_pointer_array(info.value, len(value))
-    elif type_.endswith(']'):
-        if type_.startswith('char['):
-            # This is usually a CL binary, which may contain NUL characters
-            # that should be preserved.
-            ret = _bytes(_ffi.buffer(value))
-
-        elif type_.startswith('generic_info['):
-            ret = list(map(_generic_info_to_python, value))
-        elif type_.startswith('cl_image_format['):
-            ret = [ImageFormat(imf.image_channel_order,
-                               imf.image_channel_data_type)
-                   for imf in value]
-        else:
-            ret = list(value)
-    else:
-        ret = value[0]
-    if info.free_value:
-        _lib.free_pointer(info.value)
-    return ret
-
-# }}}
-
-
-def _clobj_list(objs):
-    if objs is None:
-        return _ffi.NULL, 0
-    return [ev.ptr for ev in objs], len(objs)
-
-
-# {{{ common base class
-
-class _Common(object):
-    @classmethod
-    def _create(cls, ptr):
-        self = cls.__new__(cls)
-        self.ptr = ptr
-        return self
-    ptr = _ffi.NULL
-
-    # {{{ cleanup
-
-    # The module-global _lib variable may get set to None during interpreter
-    # cleanup before we're done cleaning up CL objects. (Symbols starting with
-    # an underscore even get cleared first [1]--although it's unclear that that
-    # really matters.) To retain our ability to clean up objects, retain a
-    # reference to the _lib module.
-    #
-    # [1] https://www.python.org/doc/essays/cleanup/
-
-    _retained_lib = _lib
-
-    def __del__(self):
-        self._retained_lib.clobj__delete(self.ptr)
-
-    # }}}
-
-    def __eq__(self, other):
-        return other.int_ptr == self.int_ptr
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
-    def __hash__(self):
-        return _lib.clobj__int_ptr(self.ptr)
-
-    def get_info(self, param):
-        info = _ffi.new('generic_info*')
-        _handle_error(_lib.clobj__get_info(self.ptr, param, info))
-        return _generic_info_to_python(info)
-
-    @property
-    def int_ptr(self):
-        return _lib.clobj__int_ptr(self.ptr)
-
-    @classmethod
-    def from_int_ptr(cls, int_ptr_value, retain=True):
-        """Constructs a :mod:`pyopencl` handle from a C-level pointer (given as
-        the integer *int_ptr_value*). If *retain* is *True* (the default)
-        :mod:`pyopencl` will call ``clRetainXXX`` on the provided object. If
-        the previous owner of the object will *not* release the reference,
-        *retain* should be set to *False*, to effectively transfer ownership to
-        :mod:`pyopencl`.
-
-        .. versionchanged:: 2016.1
-
-            *retain* added
-        """
-        ptr = _ffi.new('clobj_t*')
-        _handle_error(_lib.clobj__from_int_ptr(
-            ptr, int_ptr_value, getattr(_lib, 'CLASS_%s' % cls._id.upper()),
-            retain))
-        return cls._create(ptr[0])
-
-# }}}
-
-# }}}
-
-
-def get_cl_header_version():
-    v = _lib.get_cl_version()
-    return (v >> (3 * 4),
-            (v >> (1 * 4)) & 0xff)
-
-
-# {{{ constants
-
-_constants = {}
-
-
-# {{{ constant classes
-
-class _ConstantsNamespace(object):
-    def __init__(self):
-        raise RuntimeError("This class cannot be instantiated.")
-
-    @classmethod
-    def to_string(cls, value, default_format=None):
-        for name in dir(cls):
-            if (not name.startswith("_") and getattr(cls, name) == value):
-                return name
-
-        if default_format is None:
-            raise ValueError("a name for value %d was not found in %s"
-                    % (value, cls.__name__))
-        else:
-            return default_format % value
-
-
-# /!\ If you add anything here, add it to pyopencl/__init__.py as well.
-
-class program_kind(_ConstantsNamespace):  # noqa
-    pass
-
-
-class status_code(_ConstantsNamespace):  # noqa
-    pass
-
-
-class platform_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class device_type(_ConstantsNamespace):  # noqa
-    @classmethod
-    def to_string(cls, value, default_format=None):
-        for name in dir(cls):
-            if name in ("DEFAULT", "ALL"):
-                continue
-            if not name.startswith("_"):
-                bitfield = getattr(cls, name)
-                if (isinstance(bitfield, six.integer_types)
-                        and ((bitfield & value) == bitfield)):
-                    return name
-
-        if default_format is None:
-            raise ValueError("a name for value %d was not found in %s"
-                             % (value, cls.__name__))
-        else:
-            return default_format % value
-
-
-class device_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class device_fp_config(_ConstantsNamespace):  # noqa
-    pass
-
-
-class device_mem_cache_type(_ConstantsNamespace):  # noqa
-    pass
-
-
-class device_local_mem_type(_ConstantsNamespace):  # noqa
-    pass
-
-
-class device_exec_capabilities(_ConstantsNamespace):  # noqa
-    pass
-
-
-class device_svm_capabilities(_ConstantsNamespace):  # noqa
-    pass
-
-
-class command_queue_properties(_ConstantsNamespace):  # noqa
-    pass
-
-
-class context_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class gl_context_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class context_properties(_ConstantsNamespace):  # noqa
-    pass
-
-
-class command_queue_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class queue_properties(_ConstantsNamespace):  # noqa
-    pass
-
-
-class mem_flags(_ConstantsNamespace):  # noqa
-    @classmethod
-    def _writable(cls, flags):
-        return flags & (cls.READ_WRITE | cls.WRITE_ONLY)
-
-    @classmethod
-    def _hold_host(cls, flags):
-        return flags & cls.USE_HOST_PTR
-
-    @classmethod
-    def _use_host(cls, flags):
-        return flags & (cls.USE_HOST_PTR | cls.COPY_HOST_PTR)
-
-    @classmethod
-    def _host_writable(cls, flags):
-        return cls._writable(flags) and cls._hold_host(flags)
-
-
-class svm_mem_flags(_ConstantsNamespace):  # noqa
-    pass
-
-
-class channel_order(_ConstantsNamespace):  # noqa
-    pass
-
-
-class channel_type(_ConstantsNamespace):  # noqa
-    pass
-
-
-class mem_object_type(_ConstantsNamespace):  # noqa
-    pass
-
-
-class mem_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class image_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class addressing_mode(_ConstantsNamespace):  # noqa
-    pass
-
-
-class filter_mode(_ConstantsNamespace):  # noqa
-    pass
-
-
-class sampler_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class map_flags(_ConstantsNamespace):  # noqa
-    pass
-
-
-class program_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class program_build_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class program_binary_type(_ConstantsNamespace):  # noqa
-    pass
-
-
-class kernel_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class kernel_arg_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class kernel_arg_address_qualifier(_ConstantsNamespace):  # noqa
-    pass
-
-
-class kernel_arg_access_qualifier(_ConstantsNamespace):  # noqa
-    pass
-
-
-class kernel_arg_type_qualifier(_ConstantsNamespace):  # noqa
-    pass
-
-
-class kernel_work_group_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class event_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class command_type(_ConstantsNamespace):  # noqa
-    pass
-
-
-class command_execution_status(_ConstantsNamespace):  # noqa
-    pass
-
-
-class profiling_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class mem_migration_flags(_ConstantsNamespace):  # noqa
-    pass
-
-
-class mem_migration_flags_ext(_ConstantsNamespace):  # noqa
-    pass
-
-
-class device_partition_property(_ConstantsNamespace):  # noqa
-    pass
-
-
-class device_affinity_domain(_ConstantsNamespace):  # noqa
-    pass
-
-
-class gl_object_type(_ConstantsNamespace):  # noqa
-    pass
-
-
-class gl_texture_info(_ConstantsNamespace):  # noqa
-    pass
-
-
-class migrate_mem_object_flags_ext(_ConstantsNamespace):  # noqa
-    pass
-
-# }}}
-
-
-_locals = locals()
-
-
-# TODO: constant values are cl_ulong
-@_ffi.callback('void (*)(const char*, const char* name, int64_t value)')
-def _constant_callback(type_, name, value):
-    setattr(_locals[_ffi_pystr(type_)], _ffi_pystr(name), value)  # noqa
-
-
-_lib.populate_constants(_constant_callback)
-
-del _locals
-del _constant_callback
-
-# }}}
-
-
-# {{{ exceptions
-
-class Error(Exception):
-    class _ErrorRecord(object):
-        __slots__ = ('_routine', '_code', '_what')
-
-        def __init__(self, msg='', code=0, routine=''):
-            self._routine = routine
-            assert isinstance(code, six.integer_types)
-            self._code = code
-            self._what = msg
-
-        def routine(self):
-            return self._routine
-
-        def code(self):
-            return self._code
-
-        def what(self):
-            return self._what
-
-    def __init__(self, *a, **kw):
-        if len(a) == 1 and not kw and hasattr(a[0], 'what'):
-            super(Error, self).__init__(a[0])
-        else:
-            super(Error, self).__init__(self._ErrorRecord(*a, **kw))
-
-    def __str__(self):
-        val = self.args[0]
-        try:
-            val.routine
-        except AttributeError:
-            return str(val)
-        else:
-            result = ""
-            if val.code() != status_code.SUCCESS:
-                result = status_code.to_string(
-                        val.code(), "<unknown error %d>")
-            routine = val.routine()
-            if routine:
-                result = "%s failed: %s" % (routine, result)
-            what = val.what()
-            if what:
-                if result:
-                    result += " - "
-                result += what
-            return result
-
-    @property
-    def code(self):
-        return self.args[0].code()
-
-    @property
-    def routine(self):
-        return self.args[0].routine()
-
-    @property
-    def what(self):
-        return self.args[0].what()
-
-    def is_out_of_memory(self):
-        # matches C implementation in src/c_wrapper/error.h
-        val = self.args[0]
-
-        return (val.code == status_code.MEM_OBJECT_ALLOCATION_FAILURE
-                or val.code == status_code.OUT_OF_RESOURCES
-                or val.code == status_code.OUT_OF_HOST_MEMORY)
-
-
-class MemoryError(Error):
-    pass
-
-
-class LogicError(Error):
-    pass
-
-
-_py_RuntimeError = RuntimeError
-
-
-class RuntimeError(Error):
-    pass
-
-
-def _handle_error(error):
-    if error == _ffi.NULL:
-        return
-    if error.other == 1:
-        # non-pyopencl exceptions are handled here
-        e = _py_RuntimeError(_ffi_pystr(error.msg))
-        _lib.free_pointer(error.msg)
-        _lib.free_pointer(error)
-        raise e
-    if error.code == status_code.MEM_OBJECT_ALLOCATION_FAILURE:
-        klass = MemoryError
-    elif error.code <= status_code.INVALID_VALUE:
-        klass = LogicError
-    elif status_code.INVALID_VALUE < error.code < status_code.SUCCESS:
-        klass = RuntimeError
-    else:
-        klass = Error
-
-    e = klass(routine=_ffi_pystr(error.routine),
-              code=error.code, msg=_ffi_pystr(error.msg))
-    _lib.free_pointer(error.routine)
-    _lib.free_pointer(error.msg)
-    _lib.free_pointer(error)
-    raise e
-
-# }}}
-
-
-# {{{ Platform
-
-class Platform(_Common):
-    _id = 'platform'
-
-    def get_devices(self, device_type=device_type.ALL):
-        devices = _CArray(_ffi.new('clobj_t**'))
-        _handle_error(_lib.platform__get_devices(
-            self.ptr, devices.ptr, devices.size, device_type))
-        return [Device._create(devices.ptr[0][i])
-                for i in range(devices.size[0])]
-
-    def __repr__(self):
-        return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.int_ptr)
-
-
-def unload_platform_compiler(plat):
-    _handle_error(_lib.platform__unload_compiler(plat.ptr))
-
-
-def get_platforms():
-    platforms = _CArray(_ffi.new('clobj_t**'))
-    _handle_error(_lib.get_platforms(platforms.ptr, platforms.size))
-    return [Platform._create(platforms.ptr[0][i])
-            for i in range(platforms.size[0])]
-
-# }}}
-
-
-# {{{ Device
-
-class Device(_Common):
-    _id = 'device'
-
-    def create_sub_devices(self, props):
-        props = tuple(props) + (0,)
-        devices = _CArray(_ffi.new('clobj_t**'))
-        _handle_error(_lib.device__create_sub_devices(
-            self.ptr, devices.ptr, devices.size, props))
-        return [Device._create(devices.ptr[0][i])
-                for i in range(devices.size[0])]
-
-    def __repr__(self):
-        return "<pyopencl.Device '%s' on '%s' at 0x%x>" % (
-                self.name.strip(), self.platform.name.strip(), self.int_ptr)
-
-    @property
-    def persistent_unique_id(self):
-        return (self.vendor, self.vendor_id, self.name, self.version)
-
-# }}}
-
-
-# {{{ {Device,Platform}._get_cl_version
-
-_OPENCL_VERSION_STRING_RE = re.compile(r"^OpenCL ([0-9]+)\.([0-9]+) .*$")
-
-
-def _platdev_get_cl_version(self):
-    version_string = self.version
-    match = _OPENCL_VERSION_STRING_RE.match(version_string)
-    if match is None:
-        raise RuntimeError("platform %s returned non-conformant "
-                           "platform version string '%s'" %
-                           (self, version_string))
-
-    return int(match.group(1)), int(match.group(2))
-
-
-Platform._get_cl_version = _platdev_get_cl_version
-Device._get_cl_version = _platdev_get_cl_version
-
-# }}}
-
-
-# {{{ Context
-
-def _parse_context_properties(properties):
-    if properties is None:
-        return _ffi.NULL
-
-    props = []
-    for prop_tuple in properties:
-        if len(prop_tuple) != 2:
-            raise RuntimeError("property tuple must have length 2",
-                               status_code.INVALID_VALUE, "Context")
-
-        prop, value = prop_tuple
-        if prop is None:
-            raise RuntimeError("invalid context property",
-                               status_code.INVALID_VALUE, "Context")
-
-        props.append(prop)
-        if prop == context_properties.PLATFORM:
-            props.append(value.int_ptr)
-
-        elif prop == getattr(context_properties, "WGL_HDC_KHR", None):
-            props.append(ctypes.c_ssize_t(value).value)
-
-        elif prop in [getattr(context_properties, key, None) for key in (
-                'CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE',
-                'GL_CONTEXT_KHR',
-                'EGL_DISPLAY_KHR',
-                'GLX_DISPLAY_KHR',
-                'CGL_SHAREGROUP_KHR',
-                )]:
-
-            from ctypes import _Pointer, cast
-            if isinstance(value, _Pointer):
-                val = cast(value, ctypes.c_void_p).value
-            else:
-                val = int(value)
-
-            if not val:
-                raise LogicError("You most likely have not initialized "
-                                 "OpenGL properly.",
-                                 status_code.INVALID_VALUE, "Context")
-            props.append(val)
-        else:
-            raise RuntimeError("invalid context property",
-                               status_code.INVALID_VALUE, "Context")
-    props.append(0)
-    return props
-
-
-class Context(_Common):
-    _id = 'context'
-
-    def __init__(self, devices=None, properties=None, dev_type=None, cache_dir=None):
-        c_props = _parse_context_properties(properties)
-        status_code = _ffi.new('cl_int*')
-
-        _ctx = _ffi.new('clobj_t*')
-        if devices is not None:
-            # from device list
-            if dev_type is not None:
-                raise RuntimeError("one of 'devices' or 'dev_type' "
-                                   "must be None",
-                                   status_code.INVALID_VALUE, "Context")
-            _devices, num_devices = _clobj_list(devices)
-            # TODO parameter order? (for clobj_list)
-            _handle_error(_lib.create_context(_ctx, c_props,
-                                              num_devices, _devices))
-
-        else:
-            # from device type
-            if dev_type is None:
-                dev_type = device_type.DEFAULT
-            _handle_error(_lib.create_context_from_type(_ctx, c_props,
-                                                        dev_type))
-
-        self.ptr = _ctx[0]
-        self.cache_dir = cache_dir
-
-    def __repr__(self):
-        return "<pyopencl.Context at 0x%x on %s>" % (self.int_ptr,
-                ", ".join(repr(dev) for dev in self.devices))
-
-    @memoize_method
-    def _get_cl_version(self):
-        return self.devices[0].platform._get_cl_version()
-
-# }}}
-
-
-# {{{ CommandQueue
-
-class CommandQueue(_Common):
-    _id = 'command_queue'
-
-    def __init__(self, context, device=None, properties=None):
-        if properties is None:
-            properties = 0
-
-        ptr_command_queue = _ffi.new('clobj_t*')
-
-        _handle_error(_lib.create_command_queue(
-            ptr_command_queue, context.ptr,
-            _ffi.NULL if device is None else device.ptr, properties))
-
-        self.ptr = ptr_command_queue[0]
-
-    def finish(self):
-        _handle_error(_lib.command_queue__finish(self.ptr))
-
-    def flush(self):
-        _handle_error(_lib.command_queue__flush(self.ptr))
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.finish()
-
-    def _get_cl_version(self):
-        return self.device._get_cl_version()
-
-
-# }}}
-
-
-# {{{ _norm_shape_dtype and cffi_array
-
-def _norm_shape_dtype(shape, dtype, order="C", strides=None, name=""):
-    dtype = np.dtype(dtype)
-    if not isinstance(shape, tuple):
-        try:
-            shape = tuple(shape)
-        except Exception:
-            shape = (shape,)
-    if strides is None:
-        if order in "cC":
-            strides = c_contiguous_strides(dtype.itemsize, shape)
-        elif order in "fF":
-            strides = f_contiguous_strides(dtype.itemsize, shape)
-        else:
-            raise RuntimeError("unrecognized order specifier %s" % order,
-                               status_code.INVALID_VALUE, name)
-    return dtype, shape, strides
-
-
-class cffi_array(np.ndarray):  # noqa
-    __array_priority__ = -100.0
-
-    def __new__(cls, buf, shape, dtype, strides, base=None):
-        self = np.ndarray.__new__(cls, shape, dtype=dtype,
-                                  buffer=buf, strides=strides)
-        if base is None:
-            base = buf
-        self.__base = base
-        return self
-
-    @property
-    def base(self):
-        return self.__base
-
-# }}}
-
-
-# {{{ MemoryObjectHolder base class
-
-class MemoryObjectHolder(_Common, _CLKernelArg):
-    def get_host_array(self, shape, dtype, order="C"):
-        dtype, shape, strides = _norm_shape_dtype(
-            shape, dtype, order, None, 'MemoryObjectHolder.get_host_array')
-        _hostptr = _ffi.new('void**')
-        _size = _ffi.new('size_t*')
-        _handle_error(_lib.memory_object__get_host_array(self.ptr, _hostptr,
-                                                         _size))
-        ary = cffi_array(_ffi.buffer(_hostptr[0], _size[0]), shape,
-                         dtype, strides, self)
-        if ary.nbytes > _size[0]:
-            raise LogicError("Resulting array is larger than memory object.",
-                             status_code.INVALID_VALUE,
-                             "MemoryObjectHolder.get_host_array")
-        return ary
-
-# }}}
-
-
-# {{{ MemoryObject
-
-class MemoryObject(MemoryObjectHolder):
-    def __init__(self, hostbuf=None):
-        self.__hostbuf = hostbuf
-
-    def _handle_buf_flags(self, flags):
-        if self.__hostbuf is None:
-            return _ffi.NULL, 0, None
-        if not mem_flags._use_host(flags):
-            warnings.warn("'hostbuf' was passed, but no memory flags "
-                          "to make use of it.")
-
-        need_retain = mem_flags._hold_host(flags)
-        c_hostbuf, hostbuf_size, retained_buf = _c_buffer_from_obj(
-            self.__hostbuf, writable=mem_flags._host_writable(flags),
-            retain=need_retain)
-        if need_retain:
-            self.__retained_buf = retained_buf
-        return c_hostbuf, hostbuf_size, retained_buf
-
-    @property
-    def hostbuf(self):
-        return self.__hostbuf
-
-    def release(self):
-        _handle_error(_lib.memory_object__release(self.ptr))
-
-# }}}
-
-
-# {{{ MemoryMap
-
-class MemoryMap(_Common):
-    """
-    This class may also be used as a context manager in a ``with`` statement.
-    The memory corresponding to this object will be unmapped when
-    this object is deleted or :meth:`release` is called.
-
-    .. automethod:: release
-    """
-
-    @classmethod
-    def _create(cls, ptr, shape, typestr, strides):
-        self = _Common._create.__func__(cls, ptr)
-        self.__array_interface__ = {
-            'shape': shape,
-            'typestr': typestr,
-            'strides': strides,
-            'data': (int(_lib.clobj__int_ptr(self.ptr)), False),
-            'version': 3
-        }
-        return self
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.release()
-
-    def release(self, queue=None, wait_for=None):
-        c_wait_for, num_wait_for = _clobj_list(wait_for)
-        _event = _ffi.new('clobj_t*')
-        _handle_error(_lib.memory_map__release(
-            self.ptr, queue.ptr if queue is not None else _ffi.NULL,
-            c_wait_for, num_wait_for, _event))
-        return Event._create(_event[0])
-
-# }}}
-
-
-# {{{ _c_buffer_from_obj
-
-if _PYPY:
-    # Convert a Python object to a tuple (ptr, num_bytes, ref) to be able to
-    # pass a data stream to a C function where @ptr can be passed to a pointer
-    # argument and @num_bytes is the number of bytes. For certain types or
-    # when @writable or @retain is True, @ref is the object which keep the
-    # pointer converted from @ptr object valid.
-
-    def _c_buffer_from_obj(obj, writable=False, retain=False):
-        if isinstance(obj, bytes):
-            if writable:
-                # bytes is not writable
-                raise TypeError('expected an object with a writable '
-                                'buffer interface.')
-            if retain:
-                buf = _ffi.new('char[]', obj)
-                return (buf, len(obj), buf)
-            return (obj, len(obj), obj)
-        elif isinstance(obj, np.ndarray):
-            # numpy array
-            return (_ffi.cast('void*', obj.__array_interface__['data'][0]),
-                    obj.nbytes, obj)
-        elif isinstance(obj, np.generic):
-            if writable or retain:
-                raise TypeError('expected an object with a writable '
-                                'buffer interface.')
-
-            return (_ffi.cast('void*', memoryview(obj)._pypy_raw_address()),
-                    obj.itemsize, obj)
-        else:
-            raise LogicError("PyOpencl on PyPy only accepts numpy arrays "
-                             "and scalars arguments", status_code.INVALID_VALUE)
-
-elif sys.version_info >= (2, 7, 4):
-    import ctypes
-    try:
-        # Python 2.6 doesn't have this.
-        _ssize_t = ctypes.c_ssize_t
-    except AttributeError:
-        _ssize_t = ctypes.c_size_t
-
-    def _c_buffer_from_obj(obj, writable=False, retain=False):
-        # {{{ try the numpy array interface first
-
-        # avoid slow ctypes-based buffer interface wrapper
-
-        ary_intf = getattr(obj, "__array_interface__", None)
-        if ary_intf is not None:
-            buf_base, is_read_only = ary_intf["data"]
-            return (
-                    _ffi.cast('void*', buf_base + ary_intf.get("offset", 0)),
-                    obj.nbytes,
-                    obj)
-
-        # }}}
-
-        # {{{ fall back to the old CPython buffer protocol API
-
-        from pyopencl._buffers import Py_buffer, PyBUF_ANY_CONTIGUOUS, PyBUF_WRITABLE
-
-        flags = PyBUF_ANY_CONTIGUOUS
-        if writable:
-            flags |= PyBUF_WRITABLE
-
-        with Py_buffer.from_object(obj, flags) as buf:
-            return _ffi.cast('void*', buf.buf), buf.len, obj
-
-        # }}}
-
-else:
-    # Py2.6 and below
-
-    import ctypes
-    try:
-        # Python 2.6 doesn't have this.
-        _ssize_t = ctypes.c_ssize_t
-    except AttributeError:
-        _ssize_t = ctypes.c_size_t
-
-    def _c_buffer_from_obj(obj, writable=False, retain=False):
-        # {{{ fall back to the old CPython buffer protocol API
-
-        addr = ctypes.c_void_p()
-        length = _ssize_t()
-
-        try:
-            if writable:
-                ctypes.pythonapi.PyObject_AsWriteBuffer(
-                    ctypes.py_object(obj), ctypes.byref(addr),
-                    ctypes.byref(length))
-            else:
-                ctypes.pythonapi.PyObject_AsReadBuffer(
-                    ctypes.py_object(obj), ctypes.byref(addr),
-                    ctypes.byref(length))
-
-                # ctypes check exit status of these, so no need to check
-                # for errors.
-        except TypeError:
-            raise LogicError(routine=None, code=status_code.INVALID_VALUE,
-                             msg=("un-sized (pure-Python) types not "
-                                  "acceptable as arguments"))
-        # }}}
-
-        return _ffi.cast('void*', addr.value), length.value, obj
-
-# }}}
-
-
-# {{{ Buffer
-
-class Buffer(MemoryObject):
-    _id = 'buffer'
-
-    def __init__(self, context, flags, size=0, hostbuf=None):
-        MemoryObject.__init__(self, hostbuf)
-        c_hostbuf, hostbuf_size, retained_buf = self._handle_buf_flags(flags)
-        if hostbuf is not None:
-            if size > hostbuf_size:
-                raise RuntimeError("Specified size is greater than host "
-                                   "buffer size",
-                                   status_code.INVALID_VALUE, "Buffer")
-            if size == 0:
-                size = hostbuf_size
-
-        ptr_buffer = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_buffer(
-            ptr_buffer, context.ptr, flags, size, c_hostbuf))
-        self.ptr = ptr_buffer[0]
-
-    def get_sub_region(self, origin, size, flags=0):
-        _sub_buf = _ffi.new('clobj_t*')
-        _handle_error(_lib.buffer__get_sub_region(_sub_buf, self.ptr, origin,
-                                                  size, flags))
-        sub_buf = self._create(_sub_buf[0])
-        MemoryObject.__init__(sub_buf, None)
-        return sub_buf
-
-    def __getitem__(self, idx):
-        if not isinstance(idx, slice):
-            raise TypeError("buffer subscript must be a slice object")
-
-        start, stop, stride = idx.indices(self.size)
-        if stride != 1:
-            raise ValueError("Buffer slice must have stride 1",
-                               status_code.INVALID_VALUE, "Buffer.__getitem__")
-
-        assert start <= stop
-
-        size = stop - start
-        return self.get_sub_region(start, size)
-
-# }}}
-
-
-# {{{ SVMAllocation
-
-class SVMAllocation(object):
-    """An object whose lifetime is tied to an allocation of shared virtual memory.
-
-    .. note::
-
-        Most likely, you will not want to use this directly, but rather
-        :func:`svm_empty` and related functions which allow access to this
-        functionality using a friendlier, more Pythonic interface.
-
-    .. versionadded:: 2016.2
-
-    .. automethod:: __init__(self, ctx, size, alignment, flags=None)
-    .. automethod:: release
-    .. automethod:: enqueue_release
-    """
-    def __init__(self, ctx, size, alignment, flags, _interface=None):
-        """
-        :arg ctx: a :class:`Context`
-        :arg flags: some of :class:`svm_mem_flags`.
-        """
-
-        self.ptr = None
-
-        ptr = _ffi.new('void**')
-        _handle_error(_lib.svm_alloc(
-            ctx.ptr, flags, size, alignment,
-            ptr))
-
-        self.ctx = ctx
-        self.ptr = ptr[0]
-        self.is_fine_grain = flags & svm_mem_flags.SVM_FINE_GRAIN_BUFFER
-
-        if _interface is not None:
-            read_write = (
-                    flags & mem_flags.WRITE_ONLY != 0
-                    or flags & mem_flags.READ_WRITE != 0)
-            _interface["data"] = (
-                    int(_ffi.cast("intptr_t", self.ptr)), not read_write)
-            self.__array_interface__ = _interface
-
-    def __del__(self):
-        if self.ptr is not None:
-            self.release()
-
-    def release(self):
-        _handle_error(_lib.svm_free(self.ctx.ptr, self.ptr))
-        self.ptr = None
-
-    def enqueue_release(self, queue, wait_for=None):
-        """
-        :arg flags: a combination of :class:`pyopencl.map_flags`
-        :returns: a :class:`pyopencl.Event`
-
-        |std-enqueue-blurb|
-        """
-        ptr_event = _ffi.new('clobj_t*')
-        c_wait_for, num_wait_for = _clobj_list(wait_for)
-        _handle_error(_lib.enqueue_svm_free(
-            ptr_event, queue.ptr, 1, self.ptr,
-            c_wait_for, num_wait_for))
-
-        self.ctx = None
-        self.ptr = None
-
-        return Event._create(ptr_event[0])
-
-# }}}
-
-
-# {{{ SVM
-
-# TODO add clSetKernelExecInfo
-
-class SVM(_CLKernelArg):
-    """Tags an object exhibiting the Python buffer interface (such as a
-    :class:`numpy.ndarray`) as referring to shared virtual memory.
-
-    Depending on the features of the OpenCL implementation, the following
-    types of objects may be passed to/wrapped in this type:
-
-    *   coarse-grain shared memory as returned by (e.g.) :func:`csvm_empty`
-        for any implementation of OpenCL 2.0.
-
-        This is how coarse-grain SVM may be used from both host and device::
-
-            svm_ary = cl.SVM(cl.csvm_empty(ctx, 1000, np.float32, alignment=64))
-            assert isinstance(svm_ary.mem, np.ndarray)
-
-            with svm_ary.map_rw(queue) as ary:
-                ary.fill(17)  # use from host
-
-            prg.twice(queue, svm_ary.mem.shape, None, svm_ary)
-
-    *   fine-grain shared memory as returned by (e.g.) :func:`fsvm_empty`,
-        if the implementation supports fine-grained shared virtual memory.
-        This memory may directly be passed to a kernel::
-
-            ary = cl.fsvm_empty(ctx, 1000, np.float32)
-            assert isinstance(ary, np.ndarray)
-
-            prg.twice(queue, ary.shape, None, cl.SVM(ary))
-            queue.finish() # synchronize
-            print(ary) # access from host
-
-        Observe how mapping (as needed in coarse-grain SVM) is no longer
-        necessary.
-
-    *   any :class:`numpy.ndarray` (or other Python object with a buffer
-        interface) if the implementation supports fine-grained *system* shared
-        virtual memory.
-
-        This is how plain :mod:`numpy` arrays may directly be passed to a
-        kernel::
-
-            ary = np.zeros(1000, np.float32)
-            prg.twice(queue, ary.shape, None, cl.SVM(ary))
-            queue.finish() # synchronize
-            print(ary) # access from host
-
-    Objects of this type may be passed to kernel calls and :func:`enqueue_copy`.
-    Coarse-grain shared-memory *must* be mapped into host address space using
-    :meth:`map` before being accessed through the :mod:`numpy` interface.
-
-    .. note::
-
-        This object merely serves as a 'tag' that changes the behavior
-        of functions to which it is passed. It has no special management
-        relationship to the memory it tags. For example, it is permissible
-        to grab a :mod:`numpy.array` out of :attr:`SVM.mem` of one
-        :class:`SVM` instance and use the array to construct another.
-        Neither of the tags need to be kept alive.
-
-    .. versionadded:: 2016.2
-
-    .. attribute:: mem
-
-        The wrapped object.
-
-    .. automethod:: __init__
-    .. automethod:: map
-    .. automethod:: map_ro
-    .. automethod:: map_rw
-    .. automethod:: as_buffer
-    """
-
-    def __init__(self, mem):
-        self.mem = mem
-
-    def map(self, queue, flags, is_blocking=True, wait_for=None):
-        """
-        :arg is_blocking: If *False*, subsequent code must wait on
-            :attr:`SVMMap.event` in the returned object before accessing the
-            mapped memory.
-        :arg flags: a combination of :class:`pyopencl.map_flags`, defaults to
-            read-write.
-        :returns: an :class:`SVMMap` instance
-
-        |std-enqueue-blurb|
-        """
-        writable = bool(
-            flags & (map_flags.WRITE | map_flags.WRITE_INVALIDATE_REGION))
-        c_buf, size, _ = _c_buffer_from_obj(self.mem, writable=writable)
-
-        ptr_event = _ffi.new('clobj_t*')
-        c_wait_for, num_wait_for = _clobj_list(wait_for)
-        _handle_error(_lib.enqueue_svm_map(
-            ptr_event, queue.ptr, is_blocking, flags,
-            c_buf, size,
-            c_wait_for, num_wait_for))
-
-        evt = Event._create(ptr_event[0])
-        return SVMMap(self, queue, evt)
-
-    def map_ro(self, queue, is_blocking=True, wait_for=None):
-        """Like :meth:`map`, but with *flags* set for a read-only map."""
-
-        return self.map(queue, map_flags.READ,
-                is_blocking=is_blocking, wait_for=wait_for)
-
-    def map_rw(self, queue, is_blocking=True, wait_for=None):
-        """Like :meth:`map`, but with *flags* set for a read-only map."""
-
-        return self.map(queue, map_flags.READ | map_flags.WRITE,
-                is_blocking=is_blocking, wait_for=wait_for)
-
-    def _enqueue_unmap(self, queue, wait_for=None):
-        c_buf, _, _ = _c_buffer_from_obj(self.mem)
-
-        ptr_event = _ffi.new('clobj_t*')
-        c_wait_for, num_wait_for = _clobj_list(wait_for)
-        _handle_error(_lib.enqueue_svm_unmap(
-            ptr_event, queue.ptr,
-            c_buf,
-            c_wait_for, num_wait_for))
-
-        return Event._create(ptr_event[0])
-
-    def as_buffer(self, ctx, flags=None):
-        """
-        :arg ctx: a :class:`Context`
-        :arg flags: a combination of :class:`pyopencl.map_flags`, defaults to
-            read-write.
-        :returns: a :class:`Buffer` corresponding to *self*.
-
-        The memory referred to by this object must not be freed before
-        the returned :class:`Buffer` is released.
-        """
-
-        if flags is None:
-            flags = mem_flags.READ_WRITE
-
-        return Buffer(ctx, flags, size=self.mem.nbytes, hostbuf=self.mem)
-
-
-def _enqueue_svm_memcpy(queue, dst, src, size=None,
-        wait_for=None, is_blocking=True):
-    dst_buf, dst_size, _ = _c_buffer_from_obj(dst, writable=True)
-    src_buf, src_size, _ = _c_buffer_from_obj(src, writable=False)
-
-    if size is None:
-        size = min(dst_size, src_size)
-
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_svm_memcpy(
-        ptr_event, queue.ptr, bool(is_blocking),
-        dst_buf, src_buf, size,
-        c_wait_for, num_wait_for,
-        NannyEvent._handle((dst_buf, src_buf))))
-
-    return NannyEvent._create(ptr_event[0])
-
-
-def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None):
-    """Fill shared virtual memory with a pattern.
-
-    :arg dest: a Python buffer object, optionally wrapped in an :class:`SVM` object
-    :arg pattern: a Python buffer object (e.g. a :class:`numpy.ndarray` with the
-        fill pattern to be used.
-    :arg byte_count: The size of the memory to be fill. Defaults to the
-        entirety of *dest*.
-
-    |std-enqueue-blurb|
-
-    .. versionadded:: 2016.2
-    """
-
-    if isinstance(dest, SVM):
-        dest = dest.mem
-
-    dst_buf, dst_size, _ = _c_buffer_from_obj(dest, writable=True)
-    pattern_buf, pattern_size, _ = _c_buffer_from_obj(pattern, writable=False)
-
-    if byte_count is None:
-        byte_count = dst_size
-
-    # pattern is copied, no need to nanny.
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_svm_memfill(
-        ptr_event, queue.ptr,
-        dst_buf, pattern_buf, pattern_size, byte_count,
-        c_wait_for, num_wait_for))
-
-    return Event._create(ptr_event[0])
-
-
-def enqueue_svm_migratemem(queue, svms, flags, wait_for=None):
-    """
-    :arg svms: a collection of Python buffer objects (e.g. :mod:`numpy`
-        arrrays), optionally wrapped in :class:`SVM` objects.
-    :arg flags: a combination of :class:`mem_migration_flags`
-
-    |std-enqueue-blurb|
-
-    .. versionadded:: 2016.2
-
-    This function requires OpenCL 2.1.
-    """
-
-    svm_pointers = _ffi.new('void *', len(svms))
-    sizes = _ffi.new('size_t', len(svms))
-
-    for i, svm in enumerate(svms):
-        if isinstance(svm, SVM):
-            svm = svm.mem
-
-        buf, size, _ = _c_buffer_from_obj(svm, writable=False)
-        svm_pointers[i] = buf
-        sizes[i] = size
-
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_svm_memfill(
-        ptr_event, queue.ptr,
-        len(svms), svm_pointers, sizes, flags,
-        c_wait_for, num_wait_for))
-
-    return Event._create(ptr_event[0])
-
-# }}}
-
-
-# {{{ SVMMap
-
-class SVMMap(_CLKernelArg):
-    """
-    .. attribute:: event
-
-    .. versionadded:: 2016.2
-
-    .. automethod:: release
-
-    This class may also be used as a context manager in a ``with`` statement.
-    :meth:`release` will be called upon exit from the ``with`` region.
-    The value returned to the ``as`` part of the context manager is the
-    mapped Python object (e.g. a :mod:`numpy` array).
-    """
-    def __init__(self, svm, queue, event):
-        self.svm = svm
-        self.queue = queue
-        self.event = event
-
-    def __del__(self):
-        if self.svm is not None:
-            self.release()
-
-    def __enter__(self):
-        return self.svm.mem
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.release()
-
-    def release(self, queue=None, wait_for=None):
-        """
-        :arg queue: a :class:`pyopencl.CommandQueue`. Defaults to the one
-            with which the map was created, if not specified.
-        :returns: a :class:`pyopencl.Event`
-
-        |std-enqueue-blurb|
-        """
-
-        evt = self.svm._enqueue_unmap(self.queue)
-        self.svm = None
-
-        return evt
-
-# }}}
-
-
-# {{{ Program
-
-class CompilerWarning(UserWarning):
-    pass
-
-
-def compiler_output(text):
-    import os
-    from warnings import warn
-    if int(os.environ.get("PYOPENCL_COMPILER_OUTPUT", "0")):
-        warn(text, CompilerWarning)
-    else:
-        warn("Non-empty compiler output encountered. Set the "
-                "environment variable PYOPENCL_COMPILER_OUTPUT=1 "
-                "to see more.", CompilerWarning)
-
-
-class _Program(_Common):
-    _id = 'program'
-
-    def __init__(self, *args):
-        if len(args) == 2:
-            ctx, source = args
-            from pyopencl.tools import is_spirv
-            if is_spirv(source):
-                self._init_il(ctx, source)
-            else:
-                self._init_source(ctx, source)
-        else:
-            self._init_binary(*args)
-
-    def _init_source(self, context, src):
-        ptr_program = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_program_with_source(
-            ptr_program, context.ptr, _to_cstring(src)))
-        self.ptr = ptr_program[0]
-
-    def _init_il(self, context, il):
-        ptr_program = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_program_with_il(
-            ptr_program, context.ptr, il, len(il)))
-        self.ptr = ptr_program[0]
-
-    def _init_binary(self, context, devices, binaries):
-        if len(devices) != len(binaries):
-            raise RuntimeError("device and binary counts don't match",
-                               status_code.INVALID_VALUE,
-                               "create_program_with_binary")
-
-        ptr_program = _ffi.new('clobj_t*')
-        ptr_devices, num_devices = _clobj_list(devices)
-        ptr_binaries = [_ffi.new('unsigned char[]', binary)
-                        for binary in binaries]
-        binary_sizes = [len(b) for b in binaries]
-
-        # TODO parameter order? (for clobj_list)
-        _handle_error(_lib.create_program_with_binary(
-            ptr_program, context.ptr, num_devices, ptr_devices,
-            ptr_binaries, binary_sizes))
-
-        self.ptr = ptr_program[0]
-
-    def kind(self):
-        kind = _ffi.new('int*')
-        _handle_error(_lib.program__kind(self.ptr, kind))
-        return kind[0]
-
-    def _build(self, options=None, devices=None):
-        if options is None:
-            options = b""
-        # TODO? reverse parameter order
-        ptr_devices, num_devices = _clobj_list(devices)
-        _handle_error(_lib.program__build(self.ptr, options,
-                                          num_devices, ptr_devices))
-
-    def get_build_info(self, device, param):
-        info = _ffi.new('generic_info *')
-        _handle_error(_lib.program__get_build_info(
-            self.ptr, device.ptr, param, info))
-        return _generic_info_to_python(info)
-
-    def compile(self, options="", devices=None, headers=[]):
-        _devs, num_devs = _clobj_list(devices)
-        _prgs, names = list(zip(*((prg.ptr, _to_cstring(name))
-                             for (name, prg) in headers)))
-        _handle_error(_lib.program__compile(
-            self.ptr, _to_cstring(options), _devs, num_devs,
-            _prgs, names, len(names)))
-
-    @classmethod
-    def link(cls, context, programs, options="", devices=None):
-        _devs, num_devs = _clobj_list(devices)
-        _prgs, num_prgs = _clobj_list(programs)
-        _prg = _ffi.new('clobj_t*')
-        _handle_error(_lib.program__link(
-            _prg, context.ptr, _prgs, num_prgs, _to_cstring(options),
-            _devs, num_devs))
-        return cls._create(_prg[0])
-
-    @classmethod
-    def create_with_builtin_kernels(cls, context, devices, kernel_names):
-        _devs, num_devs = _clobj_list(devices)
-        _prg = _ffi.new('clobj_t*')
-        _handle_error(_lib.program__create_with_builtin_kernels(
-            _prg, context.ptr, _devs, num_devs, _to_cstring(kernel_names)))
-        return cls._create(_prg[0])
-
-    def all_kernels(self):
-        knls = _CArray(_ffi.new('clobj_t**'))
-        _handle_error(_lib.program__all_kernels(
-            self.ptr, knls.ptr, knls.size))
-        return [
-                Kernel
-                ._create(knls.ptr[0][i])
-                ._setup(self)
-                for i in range(knls.size[0])]
-
-    def _get_build_logs(self):
-        build_logs = []
-        for dev in self.get_info(program_info.DEVICES):
-            try:
-                log = self.get_build_info(dev, program_build_info.LOG)
-            except Exception:
-                log = "<error retrieving log>"
-
-            build_logs.append((dev, log))
-
-        return build_logs
-
-    def build(self, options_bytes, devices=None):
-        logger.debug("build program: start")
-        err = None
-        try:
-            self._build(options=options_bytes, devices=devices)
-        except Error as e:
-            msg = e.what + "\n\n" + (75 * "=" + "\n").join(
-                    "Build on %s:\n\n%s" % (dev, log)
-                    for dev, log in self._get_build_logs())
-            code = e.code
-            routine = e.routine
-
-            err = RuntimeError(
-                    Error._ErrorRecord(
-                        msg=msg,
-                        code=code,
-                        routine=routine))
-
-        if err is not None:
-            # Python 3.2 outputs the whole list of currently active exceptions
-            # This serves to remove one (redundant) level from that nesting.
-
-            logger.debug("build program: completed, error")
-            raise err
-
-        logger.debug("build program: completed, success")
-
-        message = (75 * "=" + "\n").join(
-                "Build on %s succeeded, but said:\n\n%s" % (dev, log)
-                for dev, log in self._get_build_logs()
-                if log is not None and log.strip())
-
-        if message:
-            if self.kind() == program_kind.SOURCE:
-                build_type = "From-source build"
-            elif self.kind() == program_kind.BINARY:
-                build_type = "From-binary build"
-            else:
-                build_type = "Build"
-
-            compiler_output("%s succeeded, but resulted in non-empty logs:\n%s"
-                    % (build_type, message))
-
-        return self
-
-# }}}
-
-
-class LocalMemory(_CLKernelArg):
-    __slots__ = ('_size',)
-
-    def __init__(self, size):
-        self._size = size
-
-    @property
-    def size(self):
-        return self._size
-
-
-# {{{ Kernel
-
-class Kernel(_Common):
-    _id = 'kernel'
-
-    def __init__(self, program, name):
-        if not isinstance(program, _Program):
-            program = program._get_prg()
-
-        ptr_kernel = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_kernel(ptr_kernel, program.ptr,
-                                         _to_cstring(name)))
-        self.ptr = ptr_kernel[0]
-
-        self._setup(program)
-
-    def _setup(self, prg):
-        self._source = getattr(prg, "_source", None)
-
-        from pyopencl.invoker import generate_enqueue_and_set_args
-        self._enqueue, self._set_args = generate_enqueue_and_set_args(
-                self.function_name, self.num_args, self.num_args,
-                None,
-                warn_about_arg_count_bug=None,
-                work_around_arg_count_bug=None)
-
-        self._wg_info_cache = {}
-        return self
-
-    def set_scalar_arg_dtypes(self, scalar_arg_dtypes):
-        self._scalar_arg_dtypes = tuple(scalar_arg_dtypes)
-
-        # {{{ arg counting bug handling
-
-        # For example:
-        # https://github.com/pocl/pocl/issues/197
-        # (but Apple CPU has a similar bug)
-
-        work_around_arg_count_bug = False
-        warn_about_arg_count_bug = False
-
-        from pyopencl.characterize import has_struct_arg_count_bug
-
-        count_bug_per_dev = [
-                has_struct_arg_count_bug(dev, self.context)
-                for dev in self.context.devices]
-
-        from pytools import single_valued
-        if any(count_bug_per_dev):
-            if all(count_bug_per_dev):
-                work_around_arg_count_bug = single_valued(count_bug_per_dev)
-            else:
-                warn_about_arg_count_bug = True
-
-        # }}}
-
-        from pyopencl.invoker import generate_enqueue_and_set_args
-        self._enqueue, self._set_args = generate_enqueue_and_set_args(
-                self.function_name,
-                len(scalar_arg_dtypes), self.num_args,
-                self._scalar_arg_dtypes,
-                warn_about_arg_count_bug=warn_about_arg_count_bug,
-                work_around_arg_count_bug=work_around_arg_count_bug)
-
-    def set_args(self, *args, **kwargs):
-        # Need to duplicate the 'self' argument for dynamically generated  method
-        return self._set_args(self, *args, **kwargs)
-
-    def __call__(self, queue, global_size, local_size, *args, **kwargs):
-        # __call__ can't be overridden directly, so we need this
-        # trampoline hack.
-        return self._enqueue(self, queue, global_size, local_size, *args, **kwargs)
-
-    def capture_call(self, filename, queue, global_size, local_size,
-            *args, **kwargs):
-        from pyopencl.capture_call import capture_kernel_call
-        capture_kernel_call(self, filename, queue, global_size, local_size,
-                *args, **kwargs)
-
-    def _set_arg_clkernelarg(self, arg_index, arg):
-        if isinstance(arg, MemoryObjectHolder):
-            _handle_error(_lib.kernel__set_arg_mem(self.ptr, arg_index, arg.ptr))
-        elif isinstance(arg, SVM):
-            c_buf, _, _ = _c_buffer_from_obj(arg.mem)
-            _handle_error(_lib.kernel__set_arg_svm_pointer(
-                self.ptr, arg_index, c_buf))
-        elif isinstance(arg, Sampler):
-            _handle_error(_lib.kernel__set_arg_sampler(self.ptr, arg_index,
-                                                       arg.ptr))
-        elif isinstance(arg, LocalMemory):
-            _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index,
-                                                   _ffi.NULL, arg.size))
-        else:
-            raise RuntimeError("unexpected _CLKernelArg subclass"
-                               "dimensions", status_code.INVALID_VALUE,
-                               "clSetKernelArg")
-
-    def set_arg(self, arg_index, arg):
-        # If you change this, also change the kernel call generation logic.
-        if arg is None:
-            _handle_error(_lib.kernel__set_arg_null(self.ptr, arg_index))
-        elif isinstance(arg, _CLKernelArg):
-            self._set_arg_clkernelarg(arg_index, arg)
-        elif _CPY2 and isinstance(arg, np.generic):
-            # https://github.com/numpy/numpy/issues/5381
-            c_buf, size, _ = _c_buffer_from_obj(np.getbuffer(arg))
-            _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index,
-                                                   c_buf, size))
-        else:
-            c_buf, size, _ = _c_buffer_from_obj(arg)
-            _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index,
-                                                   c_buf, size))
-
-    def get_work_group_info(self, param, device):
-        try:
-            return self._wg_info_cache[param, device]
-        except KeyError:
-            pass
-
-        info = _ffi.new('generic_info*')
-        _handle_error(_lib.kernel__get_work_group_info(
-            self.ptr, param, device.ptr, info))
-        result = _generic_info_to_python(info)
-
-        self._wg_info_cache[param, device] = result
-        return result
-
-    def get_arg_info(self, idx, param):
-        info = _ffi.new('generic_info*')
-        _handle_error(_lib.kernel__get_arg_info(self.ptr, idx, param, info))
-        return _generic_info_to_python(info)
-
-# }}}
-
-
-# {{{ Event
-
-class Event(_Common):
-    _id = 'event'
-
-    def __init__(self):
-        pass
-
-    def get_profiling_info(self, param):
-        info = _ffi.new('generic_info *')
-        _handle_error(_lib.event__get_profiling_info(self.ptr, param, info))
-        return _generic_info_to_python(info)
-
-    def wait(self):
-        _handle_error(_lib.event__wait(self.ptr))
-
-    def set_callback(self, _type, cb):
-        def _func(status):
-            cb(status)
-        _handle_error(_lib.event__set_callback(self.ptr, _type,
-                                               _ffi.new_handle(_func)))
-
-
-class ProfilingInfoGetter:
-    def __init__(self, event):
-        self.event = event
-
-    def __getattr__(self, name):
-        info_cls = profiling_info
-
-        try:
-            inf_attr = getattr(info_cls, name.upper())
-        except AttributeError:
-            raise AttributeError("%s has no attribute '%s'"
-                    % (type(self), name))
-        else:
-            return self.event.get_profiling_info(inf_attr)
-
-
-Event.profile = property(ProfilingInfoGetter)
-
-
-def wait_for_events(wait_for):
-    if wait_for is None or len(wait_for) == 0:
-        return
-    _handle_error(_lib.wait_for_events(*_clobj_list(wait_for)))
-
-
-class NannyEvent(Event):
-    class _Data(object):
-        __slots__ = ('ward', 'ref')
-
-        def __init__(self, ward, ref):
-            self.ward = ward
-            self.ref = ref
-
-    @classmethod
-    def _handle(cls, ward, ref=None):
-        return _ffi.new_handle(cls._Data(ward, ref))
-
-    def get_ward(self):
-        _handle = _lib.nanny_event__get_ward(self.ptr)
-        if _handle == _ffi.NULL:
-            return
-        return _ffi.from_handle(_handle).ward
-
-
-class UserEvent(Event):
-    def __init__(self, ctx):
-        _evt = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_user_event(_evt, ctx.ptr))
-        self.ptr = _evt[0]
-
-    def set_status(self, status):
-        _handle_error(_lib.user_event__set_status(self.ptr, status))
-
-# }}}
-
-
-# {{{ enqueue_nd_range_kernel
-
-def enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size,
-                            global_work_offset=None, wait_for=None,
-                            g_times_l=False):
-
-    work_dim = len(global_work_size)
-
-    if local_work_size is not None:
-        if g_times_l:
-            work_dim = max(work_dim, len(local_work_size))
-        elif work_dim != len(local_work_size):
-            raise RuntimeError("global/local work sizes have differing "
-                               "dimensions", status_code.INVALID_VALUE,
-                               "enqueue_nd_range_kernel")
-
-        if len(local_work_size) < work_dim:
-            local_work_size = (local_work_size +
-                               (1,) * (work_dim - len(local_work_size)))
-        if len(global_work_size) < work_dim:
-            global_work_size = (global_work_size +
-                                (1,) * (work_dim - len(global_work_size)))
-        if g_times_l:
-            global_work_size = tuple(
-                    global_work_size[i] * local_work_size[i]
-                    for i in range(work_dim))
-
-    c_global_work_offset = _ffi.NULL
-    if global_work_offset is not None:
-        if work_dim != len(global_work_offset):
-            raise RuntimeError("global work size and offset have differing "
-                               "dimensions", status_code.INVALID_VALUE,
-                               "enqueue_nd_range_kernel")
-
-        c_global_work_offset = global_work_offset
-
-    if local_work_size is None:
-        local_work_size = _ffi.NULL
-
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_nd_range_kernel(
-        ptr_event, queue.ptr, kernel.ptr, work_dim, c_global_work_offset,
-        global_work_size, local_work_size, c_wait_for, num_wait_for))
-    return Event._create(ptr_event[0])
-
-# }}}
-
-
-# {{{ enqueue_task
-
-def enqueue_task(queue, kernel, wait_for=None):
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_task(
-        _event, queue.ptr, kernel.ptr, c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-# }}}
-
-
-# {{{ _enqueue_marker_*
-
-def _enqueue_marker_with_wait_list(queue, wait_for=None):
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_marker_with_wait_list(
-        ptr_event, queue.ptr, c_wait_for, num_wait_for))
-    return Event._create(ptr_event[0])
-
-
-def _enqueue_marker(queue):
-    ptr_event = _ffi.new('clobj_t*')
-    _handle_error(_lib.enqueue_marker(ptr_event, queue.ptr))
-    return Event._create(ptr_event[0])
-
-# }}}
-
-
-# {{{ _enqueue_barrier_*
-
-def _enqueue_barrier_with_wait_list(queue, wait_for=None):
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_barrier_with_wait_list(
-        ptr_event, queue.ptr, c_wait_for, num_wait_for))
-    return Event._create(ptr_event[0])
-
-
-def _enqueue_barrier(queue):
-    _handle_error(_lib.enqueue_barrier(queue.ptr))
-
-# }}}
-
-
-# {{{ enqueue_migrate_mem_object*
-
-def enqueue_migrate_mem_objects(queue, mem_objects, flags, wait_for=None):
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    c_mem_objs, num_mem_objs = _clobj_list(mem_objects)
-    _handle_error(_lib.enqueue_migrate_mem_objects(
-        _event, queue.ptr, c_mem_objs, num_mem_objs, flags,
-        c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-
-def enqueue_migrate_mem_object_ext(queue, mem_objects, flags, wait_for=None):
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    c_mem_objs, num_mem_objs = _clobj_list(mem_objects)
-    _handle_error(_lib.enqueue_migrate_mem_object_ext(
-        _event, queue.ptr, c_mem_objs, num_mem_objs, flags,
-        c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-# }}}
-
-
-# {{{ _enqueue_wait_for_events
-
-def _enqueue_wait_for_events(queue, wait_for=None):
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_wait_for_events(queue.ptr, c_wait_for,
-                                               num_wait_for))
-
-# }}}
-
-
-# {{{ _enqueue_*_buffer
-
-def _enqueue_read_buffer(queue, mem, hostbuf, device_offset=0,
-                         wait_for=None, is_blocking=True):
-    c_buf, size, _ = _c_buffer_from_obj(hostbuf, writable=True)
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_read_buffer(
-        ptr_event, queue.ptr, mem.ptr, c_buf, size, device_offset,
-        c_wait_for, num_wait_for, bool(is_blocking),
-        NannyEvent._handle(hostbuf)))
-    return NannyEvent._create(ptr_event[0])
-
-
-def _enqueue_write_buffer(queue, mem, hostbuf, device_offset=0,
-                          wait_for=None, is_blocking=True):
-    c_buf, size, c_ref = _c_buffer_from_obj(hostbuf, retain=True)
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_write_buffer(
-        ptr_event, queue.ptr, mem.ptr, c_buf, size, device_offset,
-        c_wait_for, num_wait_for, bool(is_blocking),
-        NannyEvent._handle(hostbuf, c_ref)))
-    return NannyEvent._create(ptr_event[0])
-
-
-def _enqueue_copy_buffer(queue, src, dst, byte_count=-1, src_offset=0,
-                         dst_offset=0, wait_for=None):
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_copy_buffer(
-        ptr_event, queue.ptr, src.ptr, dst.ptr, byte_count, src_offset,
-        dst_offset, c_wait_for, num_wait_for))
-    return Event._create(ptr_event[0])
-
-
-def _enqueue_read_buffer_rect(queue, mem, hostbuf, buffer_origin,
-                              host_origin, region, buffer_pitches=None,
-                              host_pitches=None, wait_for=None,
-                              is_blocking=True):
-    buffer_origin = tuple(buffer_origin)
-    host_origin = tuple(host_origin)
-    region = tuple(region)
-    if buffer_pitches is None:
-        buffer_pitches = _ffi.NULL
-        buffer_pitches_l = 0
-    else:
-        buffer_pitches = tuple(buffer_pitches)
-        buffer_pitches_l = len(buffer_pitches)
-    if host_pitches is None:
-        host_pitches = _ffi.NULL
-        host_pitches_l = 0
-    else:
-        host_pitches = tuple(host_pitches)
-        host_pitches_l = len(host_pitches)
-
-    buffer_origin_l = len(buffer_origin)
-    host_origin_l = len(host_origin)
-    region_l = len(region)
-    if (buffer_origin_l > 3 or host_origin_l > 3 or region_l > 3 or
-            buffer_pitches_l > 2 or host_pitches_l > 2):
-        raise RuntimeError("(buffer/host)_origin, (buffer/host)_pitches or "
-                           "region has too many components",
-                           status_code.INVALID_VALUE,
-                           "enqueue_read_buffer_rect")
-    c_buf, size, _ = _c_buffer_from_obj(hostbuf, writable=True)
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_read_buffer_rect(
-        _event, queue.ptr, mem.ptr, c_buf, buffer_origin, buffer_origin_l,
-        host_origin, host_origin_l, region, region_l, buffer_pitches,
-        buffer_pitches_l, host_pitches, host_pitches_l, c_wait_for,
-        num_wait_for, bool(is_blocking), NannyEvent._handle(hostbuf)))
-    return NannyEvent._create(_event[0])
-
-
-def _enqueue_write_buffer_rect(queue, mem, hostbuf, buffer_origin,
-                               host_origin, region, buffer_pitches=None,
-                               host_pitches=None, wait_for=None,
-                               is_blocking=True):
-    buffer_origin = tuple(buffer_origin)
-    host_origin = tuple(host_origin)
-    region = tuple(region)
-    if buffer_pitches is None:
-        buffer_pitches = _ffi.NULL
-        buffer_pitches_l = 0
-    else:
-        buffer_pitches = tuple(buffer_pitches)
-        buffer_pitches_l = len(buffer_pitches)
-    if host_pitches is None:
-        host_pitches = _ffi.NULL
-        host_pitches_l = 0
-    else:
-        host_pitches = tuple(host_pitches)
-        host_pitches_l = len(host_pitches)
-
-    buffer_origin_l = len(buffer_origin)
-    host_origin_l = len(host_origin)
-    region_l = len(region)
-    if (buffer_origin_l > 3 or host_origin_l > 3 or region_l > 3 or
-            buffer_pitches_l > 2 or host_pitches_l > 2):
-        raise RuntimeError("(buffer/host)_origin, (buffer/host)_pitches or "
-                           "region has too many components",
-                           status_code.INVALID_VALUE,
-                           "enqueue_write_buffer_rect")
-    c_buf, size, c_ref = _c_buffer_from_obj(hostbuf, retain=True)
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_write_buffer_rect(
-        _event, queue.ptr, mem.ptr, c_buf, buffer_origin, buffer_origin_l,
-        host_origin, host_origin_l, region, region_l, buffer_pitches,
-        buffer_pitches_l, host_pitches, host_pitches_l, c_wait_for,
-        num_wait_for, bool(is_blocking), NannyEvent._handle(hostbuf, c_ref)))
-    return NannyEvent._create(_event[0])
-
-
-def _enqueue_copy_buffer_rect(queue, src, dst, src_origin, dst_origin, region,
-                              src_pitches=None, dst_pitches=None,
-                              wait_for=None):
-    src_origin = tuple(src_origin)
-    dst_origin = tuple(dst_origin)
-    region = tuple(region)
-    if src_pitches is None:
-        src_pitches = _ffi.NULL
-        src_pitches_l = 0
-    else:
-        src_pitches = tuple(src_pitches)
-        src_pitches_l = len(src_pitches)
-    if dst_pitches is None:
-        dst_pitches = _ffi.NULL
-        dst_pitches_l = 0
-    else:
-        dst_pitches = tuple(dst_pitches)
-        dst_pitches_l = len(dst_pitches)
-    src_origin_l = len(src_origin)
-    dst_origin_l = len(dst_origin)
-    region_l = len(region)
-    if (src_origin_l > 3 or dst_origin_l > 3 or region_l > 3 or
-            src_pitches_l > 2 or dst_pitches_l > 2):
-        raise RuntimeError("(src/dst)_origin, (src/dst)_pitches or "
-                           "region has too many components",
-                           status_code.INVALID_VALUE,
-                           "enqueue_copy_buffer_rect")
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_copy_buffer_rect(
-        _event, queue.ptr, src.ptr, dst.ptr, src_origin, src_origin_l,
-        dst_origin, dst_origin_l, region, region_l, src_pitches,
-        src_pitches_l, dst_pitches, dst_pitches_l, c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-
-# PyPy bug report: https://bitbucket.org/pypy/pypy/issue/1777/unable-to-create-proper-numpy-array-from  # noqa
-def enqueue_map_buffer(queue, buf, flags, offset, shape, dtype,
-                       order="C", strides=None, wait_for=None,
-                       is_blocking=True):
-    dtype, shape, strides = _norm_shape_dtype(shape, dtype, order, strides,
-                                              'enqueue_map_buffer')
-    byte_size = dtype.itemsize
-    for s in shape:
-        byte_size *= s
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _event = _ffi.new('clobj_t*')
-    _map = _ffi.new('clobj_t*')
-    _handle_error(_lib.enqueue_map_buffer(_event, _map, queue.ptr, buf.ptr,
-                                          flags, offset, byte_size, c_wait_for,
-                                          num_wait_for, bool(is_blocking)))
-    mmap = MemoryMap._create(_map[0], shape, dtype.str, strides)
-    ary = np.asarray(mmap)
-    ary.dtype = dtype
-
-    return (ary, Event._create(_event[0]))
-
-
-def _enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None):
-    c_pattern, psize, c_ref = _c_buffer_from_obj(pattern)
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_fill_buffer(
-        _event, queue.ptr, mem.ptr, c_pattern, psize, offset, size,
-        c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-# }}}
-
-
-# {{{ _enqueue_*_image
-
-def _enqueue_read_image(queue, mem, origin, region, hostbuf, row_pitch=0,
-                        slice_pitch=0, wait_for=None, is_blocking=True):
-    origin = tuple(origin)
-    region = tuple(region)
-    origin_l = len(origin)
-    region_l = len(region)
-    if origin_l > 3 or region_l > 3:
-        raise RuntimeError("origin or region has too many components",
-                           status_code.INVALID_VALUE, "enqueue_read_image")
-    c_buf, size, _ = _c_buffer_from_obj(hostbuf, writable=True)
-    ptr_event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    # TODO check buffer size
-    _handle_error(_lib.enqueue_read_image(
-        ptr_event, queue.ptr, mem.ptr, origin, origin_l, region, region_l,
-        c_buf, row_pitch, slice_pitch, c_wait_for, num_wait_for,
-        bool(is_blocking), NannyEvent._handle(hostbuf)))
-    return NannyEvent._create(ptr_event[0])
-
-
-def _enqueue_copy_image(queue, src, dest, src_origin, dest_origin, region,
-                        wait_for=None):
-    src_origin = tuple(src_origin)
-    region = tuple(region)
-    src_origin_l = len(src_origin)
-    dest_origin_l = len(dest_origin)
-    region_l = len(region)
-    if src_origin_l > 3 or dest_origin_l > 3 or region_l > 3:
-        raise RuntimeError("(src/dest)origin or region has too many components",
-                           status_code.INVALID_VALUE, "enqueue_copy_image")
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_copy_image(
-        _event, queue.ptr, src.ptr, dest.ptr, src_origin, src_origin_l,
-        dest_origin, dest_origin_l, region, region_l, c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-
-def _enqueue_write_image(queue, mem, origin, region, hostbuf, row_pitch=0,
-                         slice_pitch=0, wait_for=None, is_blocking=True):
-    origin = tuple(origin)
-    region = tuple(region)
-    origin_l = len(origin)
-    region_l = len(region)
-    if origin_l > 3 or region_l > 3:
-        raise RuntimeError("origin or region has too many components",
-                           status_code.INVALID_VALUE, "enqueue_write_image")
-    c_buf, size, c_ref = _c_buffer_from_obj(hostbuf, retain=True)
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    # TODO: check buffer size
-    _handle_error(_lib.enqueue_write_image(
-        _event, queue.ptr, mem.ptr, origin, origin_l, region, region_l,
-        c_buf, row_pitch, slice_pitch, c_wait_for, num_wait_for,
-        bool(is_blocking), NannyEvent._handle(hostbuf, c_ref)))
-    return NannyEvent._create(_event[0])
-
-
-def enqueue_map_image(queue, img, flags, origin, region, shape, dtype,
-                      order="C", strides=None, wait_for=None, is_blocking=True):
-    origin = tuple(origin)
-    region = tuple(region)
-    origin_l = len(origin)
-    region_l = len(region)
-    if origin_l > 3 or region_l > 3:
-        raise RuntimeError("origin or region has too many components",
-                           status_code.INVALID_VALUE, "enqueue_map_image")
-    dtype, shape, strides = _norm_shape_dtype(shape, dtype, order, strides,
-                                              'enqueue_map_image')
-    _event = _ffi.new('clobj_t*')
-    _map = _ffi.new('clobj_t*')
-    _row_pitch = _ffi.new('size_t*')
-    _slice_pitch = _ffi.new('size_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_map_image(_event, _map, queue.ptr, img.ptr,
-                                         flags, origin, origin_l, region,
-                                         region_l, _row_pitch, _slice_pitch,
-                                         c_wait_for, num_wait_for, is_blocking))
-    mmap = MemoryMap._create(_map[0], shape, dtype.str, strides)
-    ary = np.asarray(mmap)
-    ary.dtype = dtype
-    return (ary, Event._create(_event[0]), _row_pitch[0], _slice_pitch[0])
-
-
-def enqueue_fill_image(queue, img, color, origin, region, wait_for=None):
-    origin = tuple(origin)
-    region = tuple(region)
-    origin_l = len(origin)
-    region_l = len(region)
-    color_l = len(color)
-    if origin_l > 3 or region_l > 3 or color_l > 4:
-        raise RuntimeError("origin, region or color has too many components",
-                           status_code.INVALID_VALUE, "enqueue_fill_image")
-    color = np.array(color).astype(img._fill_type)
-    c_color = _ffi.cast('void*', color.__array_interface__['data'][0])
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_fill_image(_event, queue.ptr, img.ptr,
-                                          c_color, origin, origin_l, region,
-                                          region_l, c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-
-def _enqueue_copy_image_to_buffer(queue, src, dest, origin, region, offset,
-                                  wait_for=None):
-    origin = tuple(origin)
-    region = tuple(region)
-    origin_l = len(origin)
-    region_l = len(region)
-    if origin_l > 3 or region_l > 3:
-        raise RuntimeError("origin or region has too many components",
-                           status_code.INVALID_VALUE,
-                           "enqueue_copy_image_to_buffer")
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_copy_image_to_buffer(
-        _event, queue.ptr, src.ptr, dest.ptr, origin, origin_l, region,
-        region_l, offset, c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-
-def _enqueue_copy_buffer_to_image(queue, src, dest, offset, origin, region,
-                                  wait_for=None):
-    origin = tuple(origin)
-    region = tuple(region)
-    origin_l = len(origin)
-    region_l = len(region)
-    if origin_l > 3 or region_l > 3:
-        raise RuntimeError("origin or region has too many components",
-                           status_code.INVALID_VALUE,
-                           "enqueue_copy_buffer_to_image")
-    _event = _ffi.new('clobj_t*')
-    c_wait_for, num_wait_for = _clobj_list(wait_for)
-    _handle_error(_lib.enqueue_copy_buffer_to_image(
-        _event, queue.ptr, src.ptr, dest.ptr, offset, origin, origin_l,
-        region, region_l, c_wait_for, num_wait_for))
-    return Event._create(_event[0])
-
-# }}}
-
-
-# {{{ gl interop
-
-def have_gl():
-    return bool(_lib.have_gl())
-
-
-class _GLObject(object):
-    def get_gl_object_info(self):
-        otype = _ffi.new('cl_gl_object_type*')
-        gl_name = _ffi.new('GLuint*')
-        _handle_error(_lib.get_gl_object_info(self.ptr, otype, gl_name))
-        return otype[0], gl_name[0]
-
-
-class GLBuffer(MemoryObject, _GLObject):
-    _id = 'gl_buffer'
-
-    def __init__(self, context, flags, bufobj):
-        MemoryObject.__init__(self)
-        ptr = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_from_gl_buffer(
-            ptr, context.ptr, flags, bufobj))
-        self.ptr = ptr[0]
-
-
-class GLRenderBuffer(MemoryObject, _GLObject):
-    _id = 'gl_renderbuffer'
-
-    def __init__(self, context, flags, bufobj):
-        MemoryObject.__init__(self, bufobj)
-        c_buf, bufsize, retained = self._handle_buf_flags(flags)
-        ptr = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_from_gl_renderbuffer(
-            ptr, context.ptr, flags, c_buf))
-        self.ptr = ptr[0]
-
-
-def _create_gl_enqueue(what):
-    def enqueue_gl_objects(queue, mem_objects, wait_for=None):
-        ptr_event = _ffi.new('clobj_t*')
-        c_wait_for, num_wait_for = _clobj_list(wait_for)
-        c_mem_objects, num_mem_objects = _clobj_list(mem_objects)
-        _handle_error(what(ptr_event, queue.ptr, c_mem_objects, num_mem_objects,
-                           c_wait_for, num_wait_for))
-        return Event._create(ptr_event[0])
-    return enqueue_gl_objects
-
-
-if _lib.have_gl():
-    enqueue_acquire_gl_objects = _create_gl_enqueue(
-        _lib.enqueue_acquire_gl_objects)
-    enqueue_release_gl_objects = _create_gl_enqueue(
-        _lib.enqueue_release_gl_objects)
-    try:
-        get_apple_cgl_share_group = _lib.get_apple_cgl_share_group
-    except AttributeError:
-        pass
-
-# }}}
-
-
-def _cffi_property(_name=None, read=True, write=True):
-    def _deco(get_ptr):
-        name = _name if _name else get_ptr.__name__
-        return property((lambda self: getattr(get_ptr(self), name)) if read
-                        else (lambda self: None),
-                        (lambda self, v: setattr(get_ptr(self), name, v))
-                        if write else (lambda self, v: None))
-    return _deco
-
-
-# {{{ ImageFormat
-
-class ImageFormat(object):
-    # Hack around fmt.__dict__ check in test_wrapper.py
-    __dict__ = {}
-    __slots__ = ('ptr',)
-
-    def __init__(self, channel_order=0, channel_type=0):
-        self.ptr = _ffi.new("cl_image_format*")
-        self.channel_order = channel_order
-        self.channel_data_type = channel_type
-
-    @_cffi_property('image_channel_order')
-    def channel_order(self):
-        return self.ptr
-
-    @_cffi_property('image_channel_data_type')
-    def channel_data_type(self):
-        return self.ptr
-
-    @property
-    def channel_count(self):
-        try:
-            return {
-                channel_order.R: 1,
-                channel_order.A: 1,
-                channel_order.RG: 2,
-                channel_order.RA: 2,
-                channel_order.RGB: 3,
-                channel_order.RGBA: 4,
-                channel_order.BGRA: 4,
-                channel_order.INTENSITY: 1,
-                channel_order.LUMINANCE: 1,
-            }[self.channel_order]
-        except KeyError:
-            raise LogicError("unrecognized channel order",
-                             status_code.INVALID_VALUE,
-                             "ImageFormat.channel_count")
-
-    @property
-    def dtype_size(self):
-        try:
-            return {
-                channel_type.SNORM_INT8: 1,
-                channel_type.SNORM_INT16: 2,
-                channel_type.UNORM_INT8: 1,
-                channel_type.UNORM_INT16: 2,
-                channel_type.UNORM_SHORT_565: 2,
-                channel_type.UNORM_SHORT_555: 2,
-                channel_type.UNORM_INT_101010: 4,
-                channel_type.SIGNED_INT8: 1,
-                channel_type.SIGNED_INT16: 2,
-                channel_type.SIGNED_INT32: 4,
-                channel_type.UNSIGNED_INT8: 1,
-                channel_type.UNSIGNED_INT16: 2,
-                channel_type.UNSIGNED_INT32: 4,
-                channel_type.HALF_FLOAT: 2,
-                channel_type.FLOAT: 4,
-            }[self.channel_data_type]
-        except KeyError:
-            raise LogicError("unrecognized channel data type",
-                             status_code.INVALID_VALUE,
-                             "ImageFormat.channel_dtype_size")
-
-    @property
-    def itemsize(self):
-        return self.channel_count * self.dtype_size
-
-    def __repr__(self):
-        return "ImageFormat(%s, %s)" % (
-                channel_order.to_string(self.channel_order,
-                    "<unknown channel order 0x%x>"),
-                channel_type.to_string(self.channel_data_type,
-                    "<unknown channel data type 0x%x>"))
-
-    def __eq__(self, other):
-        return (self.channel_order == other.channel_order
-                and self.channel_data_type == other.channel_data_type)
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
-    def __hash__(self):
-        return hash((type(self), self.channel_order, self.channel_data_type))
-
-
-def get_supported_image_formats(context, flags, image_type):
-    info = _ffi.new('generic_info*')
-    _handle_error(_lib.context__get_supported_image_formats(
-        context.ptr, flags, image_type, info))
-    return _generic_info_to_python(info)
-
-# }}}
-
-
-# {{{ ImageDescriptor
-
-def _write_only_property(*arg):
-    return property().setter(*arg)
-
-
-class ImageDescriptor(object):
-    __slots__ = ('ptr',)
-
-    def __init__(self):
-        self.ptr = _ffi.new("cl_image_desc*")
-
-    @_cffi_property()
-    def image_type(self):
-        return self.ptr
-
-    @_cffi_property('image_array_size')
-    def array_size(self):
-        return self.ptr
-
-    @_cffi_property()
-    def num_mip_levels(self):
-        return self.ptr
-
-    @_cffi_property()
-    def num_samples(self):
-        return self.ptr
-
-    @_write_only_property
-    def shape(self, shape):
-        sdims = len(shape)
-        if sdims > 3:
-            raise LogicError("shape has too many components",
-                             status_code.INVALID_VALUE, "transfer")
-        desc = self.ptr
-        desc.image_width = shape[0] if sdims > 0 else 1
-        desc.image_height = shape[1] if sdims > 1 else 1
-        desc.image_depth = shape[2] if sdims > 2 else 1
-        desc.image_array_size = desc.image_depth
-
-    @_write_only_property
-    def pitches(self, pitches):
-        pdims = len(pitches)
-        if pdims > 2:
-            raise LogicError("pitches has too many components",
-                             status_code.INVALID_VALUE, "transfer")
-        desc = self.ptr
-        desc.image_row_pitch = pitches[0] if pdims > 0 else 1
-        desc.image_slice_pitch = pitches[1] if pdims > 1 else 1
-
-    @_write_only_property
-    def buffer(self, buff):
-        self.ptr.buffer = buff.ptr.int_ptr if buff else _ffi.NULL
-
-# }}}
-
-
-# {{{ Image
-
-_int_dtype = ({
-    8: np.int64,
-    4: np.int32,
-    2: np.int16,
-    1: np.int8,
-})[_ffi.sizeof('int')]
-
-_uint_dtype = ({
-    8: np.uint64,
-    4: np.uint32,
-    2: np.uint16,
-    1: np.uint8,
-})[_ffi.sizeof('unsigned')]
-
-_float_dtype = ({
-    8: np.float64,
-    4: np.float32,
-    2: np.float16,
-})[_ffi.sizeof('float')]
-
-_fill_dtype_dict = {
-    _lib.TYPE_INT: _int_dtype,
-    _lib.TYPE_UINT: _uint_dtype,
-    _lib.TYPE_FLOAT: _float_dtype,
-    }
-
-
-class Image(MemoryObject):
-    _id = 'image'
-
-    def __init_dispatch(self, *args):
-        if len(args) == 5:
-            # >= 1.2
-            self.__init_1_2(*args)
-        elif len(args) == 6:
-            # <= 1.1
-            self.__init_legacy(*args)
-        else:
-            assert False
-        self._fill_type = _fill_dtype_dict[_lib.image__get_fill_type(self.ptr)]
-
-    def __init_1_2(self, context, flags, fmt, desc, hostbuf):
-        MemoryObject.__init__(self, hostbuf)
-        c_buf, size, retained_buf = self._handle_buf_flags(flags)
-        ptr = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_image_from_desc(ptr, context.ptr, flags,
-                                                  fmt.ptr, desc.ptr, c_buf))
-        self.ptr = ptr[0]
-
-    def __init_legacy(self, context, flags, fmt, shape, pitches, hostbuf):
-        if shape is None:
-            raise LogicError("'shape' must be given",
-                             status_code.INVALID_VALUE, "Image")
-        MemoryObject.__init__(self, hostbuf)
-        c_buf, size, retained_buf = self._handle_buf_flags(flags)
-        dims = len(shape)
-        if dims == 2:
-            width, height = shape
-            pitch = 0
-            if pitches is not None:
-                try:
-                    pitch, = pitches
-                except ValueError:
-                    raise LogicError("invalid length of pitch tuple",
-                                     status_code.INVALID_VALUE, "Image")
-
-            # check buffer size
-            if (hostbuf is not None and
-                    max(pitch, width * fmt.itemsize) * height > size):
-                raise LogicError("buffer too small",
-                                 status_code.INVALID_VALUE, "Image")
-
-            ptr = _ffi.new('clobj_t*')
-            _handle_error(_lib.create_image_2d(ptr, context.ptr, flags, fmt.ptr,
-                                               width, height, pitch, c_buf))
-            self.ptr = ptr[0]
-        elif dims == 3:
-            width, height, depth = shape
-            pitch_x, pitch_y = 0, 0
-            if pitches is not None:
-                try:
-                    pitch_x, pitch_y = pitches
-                except ValueError:
-                    raise LogicError("invalid length of pitch tuple",
-                                     status_code.INVALID_VALUE, "Image")
-
-            # check buffer size
-            if (hostbuf is not None and
-                (max(max(pitch_x, width * fmt.itemsize) *
-                     height, pitch_y) * depth > size)):
-                raise LogicError("buffer too small",
-                                 status_code.INVALID_VALUE, "Image")
-
-            ptr = _ffi.new('clobj_t*')
-            _handle_error(_lib.create_image_3d(
-                ptr, context.ptr, flags, fmt.ptr,
-                width, height, depth, pitch_x, pitch_y, c_buf))
-
-            self.ptr = ptr[0]
-        else:
-            raise LogicError("invalid dimension",
-                             status_code.INVALID_VALUE, "Image")
-
-    def __init__(self, context, flags, format, shape=None, pitches=None,
-            hostbuf=None, is_array=False, buffer=None):
-
-        if shape is None and hostbuf is None:
-            raise Error("'shape' must be passed if 'hostbuf' is not given")
-
-        if shape is None and hostbuf is not None:
-            shape = hostbuf.shape
-
-        if hostbuf is not None and not \
-                (flags & (mem_flags.USE_HOST_PTR | mem_flags.COPY_HOST_PTR)):
-            from warnings import warn
-            warn("'hostbuf' was passed, but no memory flags to make use of it.")
-
-        if hostbuf is None and pitches is not None:
-            raise Error("'pitches' may only be given if 'hostbuf' is given")
-
-        if context._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2):
-            if buffer is not None and is_array:
-                    raise ValueError(
-                            "'buffer' and 'is_array' are mutually exclusive")
-
-            if len(shape) == 3:
-                if buffer is not None:
-                    raise TypeError(
-                            "'buffer' argument is not supported for 3D arrays")
-                elif is_array:
-                    image_type = mem_object_type.IMAGE2D_ARRAY
-                else:
-                    image_type = mem_object_type.IMAGE3D
-
-            elif len(shape) == 2:
-                if buffer is not None:
-                    raise TypeError(
-                            "'buffer' argument is not supported for 2D arrays")
-                elif is_array:
-                    image_type = mem_object_type.IMAGE1D_ARRAY
-                else:
-                    image_type = mem_object_type.IMAGE2D
-
-            elif len(shape) == 1:
-                if buffer is not None:
-                    image_type = mem_object_type.IMAGE1D_BUFFER
-                elif is_array:
-                    raise TypeError("array of zero-dimensional images not supported")
-                else:
-                    image_type = mem_object_type.IMAGE1D
-
-            else:
-                raise ValueError("images cannot have more than three dimensions")
-
-            desc = ImageDescriptor()
-
-            desc.image_type = image_type
-            desc.shape = shape  # also sets desc.array_size
-
-            if pitches is None:
-                desc.pitches = (0, 0)
-            else:
-                desc.pitches = pitches
-
-            desc.num_mip_levels = 0  # per CL 1.2 spec
-            desc.num_samples = 0  # per CL 1.2 spec
-            desc.buffer = buffer
-
-            self.__init_dispatch(context, flags, format, desc, hostbuf)
-        else:
-            # legacy init for CL 1.1 and older
-            if is_array:
-                raise TypeError("'is_array=True' is not supported for CL < 1.2")
-            # if num_mip_levels is not None:
-                # raise TypeError(
-                #       "'num_mip_levels' argument is not supported for CL < 1.2")
-            # if num_samples is not None:
-                # raise TypeError(
-                #        "'num_samples' argument is not supported for CL < 1.2")
-            if buffer is not None:
-                raise TypeError("'buffer' argument is not supported for CL < 1.2")
-
-            self.__init_dispatch(context, flags, format, shape,
-                    pitches, hostbuf)
-
-    def get_image_info(self, param):
-        info = _ffi.new('generic_info*')
-        _handle_error(_lib.image__get_image_info(self.ptr, param, info))
-        return _generic_info_to_python(info)
-
-    @property
-    def shape(self):
-        if self.type == mem_object_type.IMAGE2D:
-            return (self.width, self.height)
-        elif self.type == mem_object_type.IMAGE3D:
-            return (self.width, self.height, self.depth)
-        else:
-            raise LogicError("only images have shapes")
-
-
-class _ImageInfoGetter:
-    def __init__(self, event):
-        from warnings import warn
-        warn("Image.image.attr is deprecated. "
-                "Use Image.attr directly, instead.")
-
-        self.event = event
-
-    def __getattr__(self, name):
-        try:
-            inf_attr = getattr(image_info, name.upper())
-        except AttributeError:
-            raise AttributeError("%s has no attribute '%s'"
-                    % (type(self), name))
-        else:
-            return self.event.get_image_info(inf_attr)
-
-
-Image.image = property(_ImageInfoGetter)
-
-# }}}
-
-
-# {{{ Sampler
-
-class Sampler(_Common, _CLKernelArg):
-    _id = 'sampler'
-
-    def __init__(self, context, normalized_coords, addressing_mode, filter_mode):
-        ptr = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_sampler(
-            ptr, context.ptr, normalized_coords, addressing_mode, filter_mode))
-        self.ptr = ptr[0]
-
-# }}}
-
-
-# {{{ GLTexture
-
-class GLTexture(Image, _GLObject):
-    _id = 'gl_texture'
-
-    def __init__(self, context, flags, texture_target, miplevel, texture, dims=None):
-        ptr = _ffi.new('clobj_t*')
-        _handle_error(_lib.create_from_gl_texture(
-            ptr, context.ptr, flags, texture_target, miplevel, texture))
-        self.ptr = ptr[0]
-
-# }}}
-
-
-# {{{ DeviceTopologyAmd
-
-class DeviceTopologyAmd(object):
-    # Hack around fmt.__dict__ check in test_wrapper.py
-    __dict__ = {}
-    __slots__ = ('ptr',)
-
-    def __init__(self, bus=0, device=0, function=0):
-        self.ptr = _ffi.new("cl_device_topology_amd*")
-        self.bus = bus
-        self.device = device
-        self.function = function
-
-    def _check_range(self, value, prop=None):
-        if (value < -127) or (value > 127):
-            raise ValueError("Value %s not in range [-127, 127].")
-
-    @_cffi_property('pcie')
-    def _pcie(self):
-        return self.ptr
-
-    @property
-    def bus(self):
-        return self._pcie.bus
-
-    @bus.setter
-    def bus(self, value):
-        self._check_range(value)
-        self._pcie.bus = value
-
-    @property
-    def device(self):
-        return self._pcie.device
-
-    @device.setter
-    def device(self, value):
-        self._pcie.device = value
-
-    @property
-    def function(self):
-        return self._pcie.function
-
-    @function.setter
-    def function(self, value):
-        self._pcie.function = value
-
-# }}}
-
-
-# {{{ get_info monkeypatchery
-
-def add_get_info_attrs(cls, info_method, info_class, cacheable_attrs=None):
-    if cacheable_attrs is None:
-        cacheable_attrs = []
-
-    def make_getinfo(info_method, info_name, info_attr):
-        def result(self):
-            return info_method(self, info_attr)
-
-        return property(result)
-
-    def make_cacheable_getinfo(info_method, info_name, cache_attr, info_attr):
-        def result(self):
-            try:
-                return getattr(self, cache_attr)
-            except AttributeError:
-                pass
-
-            result = info_method(self, info_attr)
-            setattr(self, cache_attr, result)
-            return result
-
-        return property(result)
-
-    for info_name, info_value in six.iteritems(info_class.__dict__):
-        if info_name == "to_string" or info_name.startswith("_"):
-            continue
-
-        info_lower = info_name.lower()
-        info_constant = getattr(info_class, info_name)
-        if info_name in cacheable_attrs:
-            cache_attr = intern("_info_cache_" + info_lower)
-            setattr(cls, info_lower, make_cacheable_getinfo(
-                info_method, info_lower, cache_attr, info_constant))
-        else:
-            setattr(cls, info_lower, make_getinfo(
-                    info_method, info_name, info_constant))
-
-
-add_get_info_attrs(Platform, Platform.get_info, platform_info),
-add_get_info_attrs(Device, Device.get_info, device_info,
-                ["PLATFORM", "MAX_WORK_GROUP_SIZE", "MAX_COMPUTE_UNITS"])
-add_get_info_attrs(Context, Context.get_info, context_info)
-add_get_info_attrs(CommandQueue, CommandQueue.get_info, command_queue_info,
-                ["CONTEXT", "DEVICE"])
-add_get_info_attrs(Event, Event.get_info, event_info)
-add_get_info_attrs(MemoryObjectHolder, MemoryObjectHolder.get_info, mem_info)
-add_get_info_attrs(Image, Image.get_image_info, image_info)
-add_get_info_attrs(Kernel, Kernel.get_info, kernel_info)
-add_get_info_attrs(Sampler, Sampler.get_info, sampler_info)
-
-# }}}
-
-
-if have_gl():
-    def gl_object_get_gl_object(self):
-        return self.get_gl_object_info()[1]
-
-    GLBuffer.gl_object = property(gl_object_get_gl_object)
-    GLTexture.gl_object = property(gl_object_get_gl_object)
-
-# vim: foldmethod=marker
diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py
index 26a4a688bba94c3576fc45f9e98fd8e6ef0a6e63..873e1c11c834b4c9b0dfa28837440f318b3a7b21 100644
--- a/pyopencl/characterize/__init__.py
+++ b/pyopencl/characterize/__init__.py
@@ -387,3 +387,33 @@ def has_struct_arg_count_bug(dev, ctx=None):
             return "pocl"
 
     return False
+
+
+def _may_have_svm(dev):
+    """Say whether SVM is plausible for *dev* (platform and CL headers)."""
+    plat = dev.platform
+    by_cl_version = plat._get_cl_version() >= (2, 0)
+    by_cl_version = by_cl_version and cl.get_cl_header_version() >= (2, 0)
+    if plat.name != "Portable Computing Language":
+        return by_cl_version
+    # pocl is judged by its own release number, not its reported CL version.
+    pocl_ok = get_pocl_version(plat) >= (1, 0)
+    return pocl_ok and cl.get_cl_header_version() >= (2, 0)
+
+
+def has_coarse_grain_buffer_svm(dev):
+    """Return True if *dev* reports coarse-grained buffer SVM support."""
+    return bool(_may_have_svm(dev)
+        and dev.svm_capabilities & cl.device_svm_capabilities.COARSE_GRAIN_BUFFER)
+
+
+def has_fine_grain_buffer_svm(dev):
+    """Return True if *dev* reports fine-grained buffer SVM support."""
+    return bool(_may_have_svm(dev)
+        and dev.svm_capabilities & cl.device_svm_capabilities.FINE_GRAIN_BUFFER)
+
+
+def has_fine_grain_system_svm(dev):
+    """Return True if *dev* reports fine-grained system SVM support."""
+    return bool(_may_have_svm(dev)
+        and dev.svm_capabilities & cl.device_svm_capabilities.FINE_GRAIN_SYSTEM)
diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 8cad3f258c0036f24fd5f95e34b3b512d4f61542..b580c5375e298ff5d5864c52cebd656af42eac89 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -28,7 +28,7 @@ import sys
 import numpy as np
 
 from warnings import warn
-from pyopencl._cffi import ffi as _ffi
+import pyopencl._cl as _cl
 from pytools.persistent_dict import WriteOncePersistentDict
 from pyopencl.tools import _NumpyTypesKeyBuilder
 
@@ -44,7 +44,7 @@ _size_t_char = ({
     4: 'L',
     2: 'H',
     1: 'B',
-})[_ffi.sizeof('size_t')]
+})[_cl._sizeof_size_t()]
 _type_char_map = {
     'n': _size_t_char.lower(),
     'N': _size_t_char
@@ -59,27 +59,24 @@ del _size_t_char
 def generate_buffer_arg_setter(gen, arg_idx, buf_var):
     from pytools.py_codegen import Indentation
 
-    if _CPY2:
+    if _CPY2 or _PYPY:
         # https://github.com/numpy/numpy/issues/5381
         gen("if isinstance({buf_var}, np.generic):".format(buf_var=buf_var))
         with Indentation(gen):
-            gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var))
+            if _PYPY:
+                gen("{buf_var} = np.asarray({buf_var})".format(buf_var=buf_var))
+            else:
+                gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var))
 
     gen("""
-        c_buf, sz, _ = _cl._c_buffer_from_obj({buf_var})
-        status = _lib.kernel__set_arg_buf(self.ptr, {arg_idx}, c_buf, sz)
-        if status != _ffi.NULL:
-            _handle_error(status)
+        self._set_arg_buf({arg_idx}, {buf_var})
         """
         .format(arg_idx=arg_idx, buf_var=buf_var))
 
 
 def generate_bytes_arg_setter(gen, arg_idx, buf_var):
     gen("""
-        status = _lib.kernel__set_arg_buf(self.ptr, {arg_idx},
-            {buf_var}, len({buf_var}))
-        if status != _ffi.NULL:
-            _handle_error(status)
+        self._set_arg_buf({arg_idx}, {buf_var})
         """
         .format(arg_idx=arg_idx, buf_var=buf_var))
 
@@ -89,11 +86,9 @@ def generate_generic_arg_handler(gen, arg_idx, arg_var):
 
     gen("""
         if {arg_var} is None:
-            status = _lib.kernel__set_arg_null(self.ptr, {arg_idx})
-            if status != _ffi.NULL:
-                _handle_error(status)
-        elif isinstance({arg_var}, _cl._CLKernelArg):
-            self._set_arg_clkernelarg({arg_idx}, {arg_var})
+            self._set_arg_null({arg_idx})
+        elif isinstance({arg_var}, _KERNEL_ARG_CLASSES):
+            self.set_arg({arg_idx}, {arg_var})
         """
         .format(arg_idx=arg_idx, arg_var=arg_var))
 
@@ -289,10 +284,8 @@ def wrap_in_error_handler(body, arg_names):
 
 def add_local_imports(gen):
     gen("import numpy as np")
-    gen("import pyopencl.cffi_cl as _cl")
-    gen(
-        "from pyopencl.cffi_cl import _lib, "
-        "_ffi, _handle_error, _CLKernelArg")
+    gen("import pyopencl._cl as _cl")
+    gen("from pyopencl import _KERNEL_ARG_CLASSES")
     gen("")
 
 
@@ -359,7 +352,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v1",
+        "pyopencl-invoker-cache-v6",
         key_builder=_NumpyTypesKeyBuilder())
 
 
diff --git a/pyopencl/mempool.py b/pyopencl/mempool.py
deleted file mode 100644
index 6b1740ec3bedec01047d29bae52c53deee0edb21..0000000000000000000000000000000000000000
--- a/pyopencl/mempool.py
+++ /dev/null
@@ -1,275 +0,0 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
-
-__copyright__ = """
-Copyright (C) 2014 Andreas Kloeckner
-"""
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-import numpy as np
-import pyopencl as cl
-from pyopencl.tools import bitlog2
-
-
-# {{{ allocators
-
-class AllocatorBase(object):
-    def __call__(self, nbytes):
-        try_count = 0
-
-        while try_count < 2:
-            try:
-                return self.allocate(nbytes)
-            except cl.Error as e:
-                if not e.is_out_of_memory():
-                    raise
-                try_count += 1
-                if try_count == 2:
-                    raise
-
-            self.try_release_blocks()
-
-    def try_release_blocks(self):
-        import gc
-        gc.collect()
-
-    def free(self, buf):
-        buf.release()
-
-
-class DeferredAllocator(AllocatorBase):
-    is_deferred = True
-
-    def __init__(self, context, mem_flags=cl.mem_flags.READ_WRITE):
-        self.context = context
-        self.mem_flags = mem_flags
-
-    def allocate(self, nbytes):
-        return cl.Buffer(self.context, self.mem_flags, nbytes)
-
-
-_zero = np.array([0, 0, 0, 0], dtype=np.int8)
-
-
-class ImmediateAllocator(AllocatorBase):
-    is_deferred = False
-
-    def __init__(self, queue, mem_flags=cl.mem_flags.READ_WRITE):
-        self.context = queue.context
-        self.queue = queue
-        self.mem_flags = mem_flags
-
-    def allocate(self, nbytes):
-        buf = cl.Buffer(self.context, self.mem_flags, nbytes)
-
-        # Make sure the buffer gets allocated right here and right now.
-        # This looks (and is) expensive. But immediate allocators
-        # have their main use in memory pools, whose basic assumption
-        # is that allocation is too expensive anyway--but they rely
-        # on exact 'out-of-memory' information.
-
-        from pyopencl.cffi_cl import _enqueue_write_buffer
-        _enqueue_write_buffer(
-                self.queue, buf,
-                _zero[:min(len(_zero), nbytes)],
-                is_blocking=False)
-
-        # No need to wait for completion here. clWaitForEvents (e.g.)
-        # cannot return mem object allocation failures. This implies that
-        # the buffer is faulted onto the device on enqueue.
-
-        return buf
-
-# }}}
-
-
-# {{{ memory pool
-
-class MemoryPool(object):
-    mantissa_bits = 2
-    mantissa_mask = (1 << mantissa_bits) - 1
-
-    def __init__(self, allocator):
-        self.allocator = allocator
-
-        self.bin_nr_to_bin = {}
-
-        if self.allocator.is_deferred:
-            from warnings import warn
-            warn("Memory pools expect non-deferred "
-                    "semantics from their allocators. You passed a deferred "
-                    "allocator, i.e. an allocator whose allocations can turn out to "
-                    "be unavailable long after allocation.", statcklevel=2)
-
-        self.active_blocks = 0
-
-        self.stop_holding_flag = False
-
-    @classmethod
-    def bin_number(cls, size):
-        bl2 = bitlog2(size)
-
-        mantissa_bits = cls.mantissa_bits
-        if bl2 >= mantissa_bits:
-            shifted = size >> (bl2 - mantissa_bits)
-        else:
-            shifted = size << (mantissa_bits - bl2)
-
-        assert not (size and (shifted & (1 << mantissa_bits)) == 0)
-
-        chopped = shifted & cls.mantissa_mask
-
-        return bl2 << mantissa_bits | chopped
-
-    @classmethod
-    def alloc_size(cls, bin_nr):
-        mantissa_bits = cls.mantissa_bits
-
-        exponent = bin_nr >> mantissa_bits
-        mantissa = bin_nr & cls.mantissa_mask
-
-        exp_minus_mbits = exponent-mantissa_bits
-        if exp_minus_mbits >= 0:
-            ones = (1 << exp_minus_mbits) - 1
-            head = ((1 << mantissa_bits) | mantissa) << exp_minus_mbits
-        else:
-            ones = 0
-            head = ((1 << mantissa_bits) | mantissa) >> -exp_minus_mbits
-
-        assert not (ones & head)
-        return head | ones
-
-    def stop_holding(self):
-        self.stop_holding_flag = True
-        self.free_held()
-
-    def free_held(self):
-        for bin_nr, bin_list in six.iteritems(self.bin_nr_to_bin):
-            while bin_list:
-                self.allocator.free(bin_list.pop())
-
-    @property
-    def held_blocks(self):
-        return sum(
-                len(bin_list)
-                for bin_list in six.itervalues(self.bin_nr_to_bin))
-
-    def allocate(self, size):
-        bin_nr = self.bin_number(size)
-        bin_list = self.bin_nr_to_bin.setdefault(bin_nr, [])
-
-        alloc_sz = self.alloc_size(bin_nr)
-
-        if bin_list:
-            # if (m_trace)
-            #   std::cout
-            #     << "[pool] allocation of size " << size
-            #     << " served from bin " << bin_nr
-            #     << " which contained " << bin_list.size()
-            #     << " entries" << std::endl;
-            self.active_blocks += 1
-            return PooledBuffer(self, bin_list.pop(), alloc_sz)
-
-        assert self.bin_number(alloc_sz) == bin_nr
-
-        # if (m_trace)
-        #   std::cout << "[pool] allocation of size " << size
-        #   << " required new memory" << std::endl;
-
-        try:
-            result = self.allocator(alloc_sz)
-            self.active_blocks += 1
-            return PooledBuffer(self, result, alloc_sz)
-        except cl.MemoryError:
-            pass
-
-        # if (m_trace)
-        #   std::cout << "[pool] allocation triggered OOM, running GC" << std::endl;
-
-        self.allocator.try_release_blocks()
-
-        if bin_list:
-            return bin_list.pop()
-
-        # if (m_trace)
-        #   std::cout << "[pool] allocation still OOM after GC" << std::endl;
-
-        for _ in self._try_to_free_memory():
-            try:
-                result = self.allocator(alloc_sz)
-                self.active_blocks += 1
-                return PooledBuffer(self, result, alloc_sz)
-            except cl.MemoryError:
-                pass
-
-        raise cl.MemoryError(
-                "failed to free memory for allocation",
-                routine="memory_pool::allocate",
-                code=cl.status_code.MEM_OBJECT_ALLOCATION_FAILURE)
-
-    __call__ = allocate
-
-    def free(self, buf, size):
-        self.active_blocks -= 1
-        bin_nr = self.bin_number(size)
-
-        if not self.stop_holding_flag:
-            self.bin_nr_to_bin.setdefault(bin_nr, []).append(buf)
-
-            # if (m_trace)
-            #   std::cout << "[pool] block of size " << size << " returned to bin "
-            #     << bin_nr << " which now contains " << get_bin(bin_nr).size()
-            #     << " entries" << std::endl;
-        else:
-            self.allocator.free(buf)
-
-    def _try_to_free_memory(self):
-        for bin_nr, bin_list in six.iteritems(self.bin_nr_to_bin):
-            while bin_list:
-                self.allocator.free(bin_list.pop())
-                yield
-
-
-class PooledBuffer(cl.MemoryObjectHolder):
-    _id = 'buffer'
-
-    def __init__(self, pool, buf, alloc_sz):
-        self.pool = pool
-        self.buf = buf
-        self.ptr = buf.ptr
-        self._alloc_sz = alloc_sz
-
-    def release(self):
-        self.pool.free(self.buf, self._alloc_sz)
-        self.buf = None
-        self.ptr = None
-
-    def __del__(self):
-        if self.buf is not None:
-            self.release()
-
-# }}}
-
-
-# vim: foldmethod=marker
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index a3c577ef88854042b249c6d83651147af9ae298f..05ccc5d079cbf42c5fe415adc190c845a15bfcac 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -35,7 +35,7 @@ import numpy as np
 from decorator import decorator
 import pyopencl as cl
 from pytools import memoize, memoize_method
-from pyopencl.cffi_cl import _lib
+from pyopencl._cl import bitlog2  # noqa: F401
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
 
 import re
@@ -60,9 +60,11 @@ _register_types()
 
 # {{{ imported names
 
-bitlog2 = _lib.bitlog2
-from pyopencl.mempool import (  # noqa
-        PooledBuffer, DeferredAllocator, ImmediateAllocator, MemoryPool)
+from pyopencl._cl import (  # noqa
+        PooledBuffer as PooledBuffer,
+        _tools_DeferredAllocator as DeferredAllocator,
+        _tools_ImmediateAllocator as ImmediateAllocator,
+        MemoryPool as MemoryPool)
 
 # }}}
 
diff --git a/pyopencl/version.py b/pyopencl/version.py
index ddb2bc1439c4102c64a1eb14ca4ed146ea241dd3..f46939dfb75cb4e66e1c297fd8a5837099f2034f 100644
--- a/pyopencl/version.py
+++ b/pyopencl/version.py
@@ -1,3 +1,3 @@
-VERSION = (2018, 1, 1)
+VERSION = (2018, 2)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
diff --git a/setup.py b/setup.py
index 1c9ca77d0b1a05716ce20caaa09f3e7ac0cbde4f..fcf668bf7532b683605da9ceb34ed5cdf215a9d6 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,82 @@ THE SOFTWARE.
 
 import sys
 from os.path import exists
+import setuptools
+from setuptools.command.build_ext import build_ext
+
+
+# {{{ boilerplate from https://github.com/pybind/python_example/blob/2ed5a68759cd6ff5d2e5992a91f08616ef457b5c/setup.py  # noqa
+
+class get_pybind_include(object):  # noqa: N801
+    """Lazy stand-in for the pybind11 include directory.
+
+    Converting an instance to ``str`` imports pybind11 and asks it for the
+    header path, so the import is deferred until the path is really needed
+    (by which point pybind11 is expected to have been installed)."""
+
+    def __init__(self, user=False):
+        self.user = user
+
+    def __str__(self):
+        from pybind11 import get_include
+        return get_include(self.user)
+
+
+# As of Python 3.6, CCompiler has a `has_flag` method.
+# cf http://bugs.python.org/issue26689
+def has_flag(compiler, flagname):
+    """Probe *compiler*: True if a tiny C++ file compiles with *flagname*."""
+    import tempfile
+    with tempfile.NamedTemporaryFile('w', suffix='.cpp') as f:
+        f.write('int main (int argc, char **argv) { return 0; }')
+        # Flush first: the compiler opens the file by name, not via this handle.
+        f.flush()
+        try:
+            compiler.compile([f.name], extra_postargs=[flagname])
+        except setuptools.distutils.errors.CompileError:
+            return False
+    return True
+
+
+def cpp_flag(compiler):
+    """Choose the ``-std=c++NN`` switch to pass to *compiler*.
+
+    C++14 is preferred; C++11 is the fallback when C++14 is rejected.
+    """
+    # Probe from newest to oldest and stop at the first accepted flag.
+    for candidate in ('-std=c++14', '-std=c++11'):
+        if has_flag(compiler, candidate):
+            return candidate
+
+    raise RuntimeError('Unsupported compiler -- at least C++11 support '
+                       'is needed!')
+
+
+class BuildExt(build_ext):
+    """``build_ext`` that adds compiler-specific options to every extension;
+    ``c_opts`` maps a distutils compiler type to its baseline flags."""
+    c_opts = {'msvc': ['/EHsc'], 'unix': []}
+
+    if sys.platform == 'darwin':
+        c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
+
+    def build_extensions(self):
+        ct = self.compiler.compiler_type
+        # Copy: c_opts is shared class state and must not grow on every call.
+        opts = list(self.c_opts.get(ct, []))
+        if ct == 'unix':
+            opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
+            opts.append(cpp_flag(self.compiler))
+            if has_flag(self.compiler, '-fvisibility=hidden'):
+                opts.append('-fvisibility=hidden')
+        elif ct == 'msvc':
+            opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
+        for ext in self.extensions:
+            # Extend rather than replace, so flags given to the Extension survive.
+            ext.extra_compile_args = list(ext.extra_compile_args or []) + opts
+        build_ext.build_extensions(self)
+
+# }}}
 
 
 def get_config_schema():
@@ -38,7 +114,11 @@ def get_config_schema():
             IncludeDir, LibraryDir, Libraries, \
             Switch, StringListOption
 
-    default_cxxflags = ['-std=gnu++11']
+    default_cxxflags = [
+            # Required for pybind11:
+            # https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes
+            "-fvisibility=hidden"
+            ]
 
     if 'darwin' in sys.platform:
         import platform
@@ -100,7 +180,7 @@ def get_config_schema():
 def main():
     from setuptools import find_packages
     from aksetup_helper import (hack_distutils, get_config, setup,
-            check_git_submodules)
+            check_git_submodules, NumpyExtension)
     check_git_submodules()
 
     hack_distutils()
@@ -133,6 +213,8 @@ def main():
 
     conf["EXTRA_DEFINES"] = extra_defines
 
+    INCLUDE_DIRS = conf["CL_INC_DIR"] + ["pybind11/include"]  # noqa: N806
+
     ver_dic = {}
     version_file = open("pyopencl/version.py")
     try:
@@ -181,22 +263,6 @@ def main():
         print("https://pypi.python.org/pypi/pyopencl")
         sys.exit(1)
 
-    # {{{ write cffi build script
-
-    with open("cffi_build.py.in", "rt") as f:
-        build_script_template = f.read()
-
-    format_args = {}
-    for k, v in conf.items():
-        format_args[k] = repr(v)
-
-    build_script = build_script_template.format(**format_args)
-
-    with open("cffi_build.py", "wt") as f:
-        f.write(build_script)
-
-    # }}}
-
     setup(name="pyopencl",
             # metadata
             version=ver_dic["VERSION_TEXT"],
@@ -217,7 +283,6 @@ def main():
                 'Programming Language :: C++',
                 'Programming Language :: Python',
                 'Programming Language :: Python :: 2',
-                'Programming Language :: Python :: 2.6',
                 'Programming Language :: Python :: 2.7',
                 'Programming Language :: Python :: 3',
                 'Programming Language :: Python :: 3.2',
@@ -230,24 +295,43 @@ def main():
             # build info
             packages=find_packages(),
 
+            ext_modules=[
+                NumpyExtension("pyopencl._cl",
+                    [
+                        "src/wrap_constants.cpp",
+                        "src/wrap_cl.cpp",
+                        "src/wrap_cl_part_1.cpp",
+                        "src/wrap_cl_part_2.cpp",
+                        "src/wrap_mempool.cpp",
+                        "src/bitlog.cpp",
+                        ],
+                    include_dirs=INCLUDE_DIRS + [
+                        get_pybind_include(),
+                        get_pybind_include(user=True)
+                        ],
+                    library_dirs=conf["CL_LIB_DIR"],
+                    libraries=conf["CL_LIBNAME"],
+                    define_macros=list(conf["EXTRA_DEFINES"].items()),
+                    extra_compile_args=conf["CXXFLAGS"],
+                    extra_link_args=conf["LDFLAGS"],
+                    language='c++',
+                    ),
+                ],
+
             setup_requires=[
+                "pybind11",
                 "numpy",
-                "cffi>=1.1.0",
                 ],
 
             install_requires=[
                 "numpy",
                 "pytools>=2017.6",
-                "pytest>=2",
                 "decorator>=3.2.0",
-                "cffi>=1.1.0",
                 "appdirs>=1.4.0",
                 "six>=1.9.0",
                 # "Mako>=0.3.6",
                 ],
 
-            cffi_modules=["cffi_build.py:ffi"],
-
             include_package_data=True,
             package_data={
                     "pyopencl": [
@@ -258,8 +342,11 @@ def main():
                         ]
                     },
 
+            cmdclass={'build_ext': BuildExt},
             zip_safe=False)
 
 
 if __name__ == '__main__':
     main()
+
+# vim: foldmethod=marker
diff --git a/src/bitlog.cpp b/src/bitlog.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..88b820fa362668f9af11a31c8913dbeb03052e94
--- /dev/null
+++ b/src/bitlog.cpp
@@ -0,0 +1,27 @@
+#include "bitlog.hpp"
+
+
+// log_table_8[i] is floor(log2(i)) for i in 1..255; entry 0 is stored as 0.
+// It backs the byte-wise lookups of the inline helpers in bitlog.hpp.
+/* from http://graphics.stanford.edu/~seander/bithacks.html */
+const char pyopencl::log_table_8[] =
+{
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+};
+
+
diff --git a/src/bitlog.hpp b/src/bitlog.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e3ffbe01fe0ae00df25102246922deaa8dbf8b2e
--- /dev/null
+++ b/src/bitlog.hpp
@@ -0,0 +1,46 @@
+// Base-2 logarithm bithack.
+
+#ifndef _AFJDFJSDFSD_PYOPENCL_HEADER_SEEN_BITLOG_HPP
+#define _AFJDFJSDFSD_PYOPENCL_HEADER_SEEN_BITLOG_HPP
+
+
+#include <climits>
+#include <cstdint>
+
+
+namespace pyopencl
+{
+  // 256-entry lookup table, defined in bitlog.cpp.
+  extern const char log_table_8[];
+
+  // Bit-log of a 16-bit value: use the high byte if non-zero, else the low byte.
+  inline unsigned bitlog2_16(uint16_t v)
+  {
+    const unsigned long high_byte = v >> 8;
+    return high_byte ? 8+log_table_8[high_byte] : log_table_8[v];
+  }
+
+  // Same scheme one level up: split the 32-bit value into 16-bit halves.
+  inline unsigned bitlog2_32(uint32_t v)
+  {
+    const uint16_t high_half = v >> 16;
+    return high_half ? 16+bitlog2_16(high_half) : bitlog2_16(v);
+  }
+
+  // Entry point; the upper 32 bits are only inspected where unsigned long
+  // is wider than 32 bits.
+  inline unsigned bitlog2(unsigned long v)
+  {
+#if (ULONG_MAX != 4294967295)
+    const uint32_t high_word = v >> 32;
+    if (high_word)
+      return 32+bitlog2_32(high_word);
+#endif
+    return bitlog2_32(v);
+  }
+}
+
+
+
+
+#endif
diff --git a/src/c_wrapper/bitlog.cpp b/src/c_wrapper/bitlog.cpp
deleted file mode 100644
index 418eb4d8f8f5ad8b2b15131b9821e9d4cb612509..0000000000000000000000000000000000000000
--- a/src/c_wrapper/bitlog.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-#include "wrap_cl.h"
-#include "function.h"
-
-#include <climits>
-#include <stdint.h>
-
-/* from http://graphics.stanford.edu/~seander/bithacks.html */
-static const char log_table_8[] = {
-    0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-};
-
-static PYOPENCL_INLINE unsigned
-bitlog2_16(uint16_t v)
-{
-    if (unsigned long t = v >> 8) {
-        return 8 + log_table_8[t];
-    } else {
-        return log_table_8[v];
-    }
-}
-
-static PYOPENCL_INLINE unsigned
-bitlog2_32(uint32_t v)
-{
-    if (uint16_t t = v >> 16) {
-        return 16 + bitlog2_16(t);
-    } else {
-      return bitlog2_16(v);
-    }
-}
-
-unsigned
-bitlog2(unsigned long v)
-{
-#if (ULONG_MAX != 4294967295)
-    if (uint32_t t = v >> 32) {
-        return 32 + bitlog2_32(t);
-    } else {
-#endif
-        return bitlog2_32(v);
-#if (ULONG_MAX != 4294967295)
-    }
-#endif
-}
diff --git a/src/c_wrapper/buffer.cpp b/src/c_wrapper/buffer.cpp
deleted file mode 100644
index 70e1ff3ed1074f5ecdb9f046f70e15672e2fd9b6..0000000000000000000000000000000000000000
--- a/src/c_wrapper/buffer.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-#include <algorithm>
-#include "buffer.h"
-#include "context.h"
-#include "command_queue.h"
-#include "event.h"
-
-template void print_clobj<buffer>(std::ostream&, const buffer*);
-
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE buffer*
-new_buffer(cl_mem mem)
-{
-    return pyopencl_convert_obj(buffer, clReleaseMemObject, mem);
-}
-
-#if PYOPENCL_CL_VERSION >= 0x1010
-PYOPENCL_USE_RESULT buffer*
-buffer::get_sub_region(size_t orig, size_t size, cl_mem_flags flags) const
-{
-    cl_buffer_region reg = {orig, size};
-
-    auto mem = retry_mem_error([&] {
-            return pyopencl_call_guarded(clCreateSubBuffer, PYOPENCL_CL_CASTABLE_THIS, flags,
-                                         CL_BUFFER_CREATE_TYPE_REGION, &reg);
-        });
-    return new_buffer(mem);
-}
-
-#endif
-
-// c wrapper
-
-// Buffer
-error*
-create_buffer(clobj_t *buffer, clobj_t _ctx, cl_mem_flags flags,
-              size_t size, void *hostbuf)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_retry_mem_error([&] {
-            auto mem = pyopencl_call_guarded(clCreateBuffer, ctx,
-                                             flags, size, hostbuf);
-            *buffer = new_buffer(mem);
-        });
-}
-
-error*
-enqueue_read_buffer(clobj_t *evt, clobj_t _queue, clobj_t _mem,
-                    void *buffer, size_t size, size_t device_offset,
-                    const clobj_t *_wait_for, uint32_t num_wait_for,
-                    int block, void *pyobj)
-{
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    auto mem = static_cast<memory_object*>(_mem);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(
-                clEnqueueReadBuffer, queue, mem, bool(block), device_offset,
-                size, buffer, wait_for, nanny_event_out(evt, pyobj));
-        });
-}
-
-error*
-enqueue_write_buffer(clobj_t *evt, clobj_t _queue, clobj_t _mem,
-                     const void *buffer, size_t size, size_t device_offset,
-                     const clobj_t *_wait_for, uint32_t num_wait_for,
-                     int block, void *pyobj)
-{
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    auto mem = static_cast<memory_object*>(_mem);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(
-                clEnqueueWriteBuffer, queue, mem, bool(block), device_offset,
-                size, buffer, wait_for, nanny_event_out(evt, pyobj));
-        });
-}
-
-error*
-enqueue_copy_buffer(clobj_t *evt, clobj_t _queue, clobj_t _src, clobj_t _dst,
-                    ptrdiff_t byte_count, size_t src_offset, size_t dst_offset,
-                    const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    auto src = static_cast<memory_object*>(_src);
-    auto dst = static_cast<memory_object*>(_dst);
-    return c_handle_error([&] {
-            if (byte_count < 0) {
-                size_t byte_count_src = 0;
-                size_t byte_count_dst = 0;
-                pyopencl_call_guarded(
-                    clGetMemObjectInfo, src, CL_MEM_SIZE,
-                    sizeof(byte_count), &byte_count_src, nullptr);
-                pyopencl_call_guarded(
-                    clGetMemObjectInfo, src, CL_MEM_SIZE,
-                    sizeof(byte_count), &byte_count_dst, nullptr);
-                byte_count = std::min(byte_count_src, byte_count_dst);
-            }
-            const auto wait_for = buf_from_class<event>(_wait_for,
-                                                        num_wait_for);
-            retry_mem_error([&] {
-                    pyopencl_call_guarded(
-                        clEnqueueCopyBuffer, queue, src, dst, src_offset,
-                        dst_offset, byte_count, wait_for, event_out(evt));
-                });
-        });
-}
-
-
-error*
-enqueue_fill_buffer(clobj_t *evt, clobj_t _queue, clobj_t _mem, void *pattern,
-                    size_t psize, size_t offset, size_t size,
-                    const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    auto mem = static_cast<memory_object*>(_mem);
-    // TODO debug print pattern
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueFillBuffer, queue, mem, pattern,
-                                  psize, offset, size, wait_for,
-                                  event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clEnqueueFillBuffer, "CL 1.1 and below")
-#endif
-}
-
-
-// {{{ rectangular transfers
-
-error*
-enqueue_read_buffer_rect(clobj_t *evt, clobj_t _queue, clobj_t _mem, void *buf,
-                         const size_t *_buf_orig, size_t buf_orig_l,
-                         const size_t *_host_orig, size_t host_orig_l,
-                         const size_t *_reg, size_t reg_l,
-                         const size_t *_buf_pitches, size_t buf_pitches_l,
-                         const size_t *_host_pitches, size_t host_pitches_l,
-                         const clobj_t *_wait_for, uint32_t num_wait_for,
-                         int block, void *pyobj)
-{
-#if PYOPENCL_CL_VERSION >= 0x1010
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    auto mem = static_cast<memory_object*>(_mem);
-    ConstBuffer<size_t, 3> buf_orig(_buf_orig, buf_orig_l);
-    ConstBuffer<size_t, 3> host_orig(_host_orig, host_orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    ConstBuffer<size_t, 2> buf_pitches(_buf_pitches, buf_pitches_l);
-    ConstBuffer<size_t, 2> host_pitches(_host_pitches, host_pitches_l);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(
-                clEnqueueReadBufferRect, queue, mem, bool(block), buf_orig,
-                host_orig, reg, buf_pitches[0], buf_pitches[1], host_pitches[0],
-                host_pitches[1], buf, wait_for, nanny_event_out(evt, pyobj));
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clEnqueueReadBufferRect, "CL 1.0")
-#endif
-}
-
-error*
-enqueue_write_buffer_rect(clobj_t *evt, clobj_t _queue, clobj_t _mem, void *buf,
-                          const size_t *_buf_orig, size_t buf_orig_l,
-                          const size_t *_host_orig, size_t host_orig_l,
-                          const size_t *_reg, size_t reg_l,
-                          const size_t *_buf_pitches, size_t buf_pitches_l,
-                          const size_t *_host_pitches, size_t host_pitches_l,
-                          const clobj_t *_wait_for, uint32_t num_wait_for,
-                          int block, void *pyobj)
-{
-#if PYOPENCL_CL_VERSION >= 0x1010
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    auto mem = static_cast<memory_object*>(_mem);
-    ConstBuffer<size_t, 3> buf_orig(_buf_orig, buf_orig_l);
-    ConstBuffer<size_t, 3> host_orig(_host_orig, host_orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    ConstBuffer<size_t, 2> buf_pitches(_buf_pitches, buf_pitches_l);
-    ConstBuffer<size_t, 2> host_pitches(_host_pitches, host_pitches_l);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(
-                clEnqueueWriteBufferRect, queue, mem, bool(block), buf_orig,
-                host_orig, reg, buf_pitches[0], buf_pitches[1], host_pitches[0],
-                host_pitches[1], buf, wait_for, nanny_event_out(evt, pyobj));
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clEnqueueWriteBufferRect, "CL 1.0")
-#endif
-}
-
-error*
-enqueue_copy_buffer_rect(clobj_t *evt, clobj_t _queue, clobj_t _src,
-                         clobj_t _dst, const size_t *_src_orig,
-                         size_t src_orig_l, const size_t *_dst_orig,
-                         size_t dst_orig_l, const size_t *_reg, size_t reg_l,
-                         const size_t *_src_pitches, size_t src_pitches_l,
-                         const size_t *_dst_pitches, size_t dst_pitches_l,
-                         const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x1010
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    auto src = static_cast<memory_object*>(_src);
-    auto dst = static_cast<memory_object*>(_dst);
-    ConstBuffer<size_t, 3> src_orig(_src_orig, src_orig_l);
-    ConstBuffer<size_t, 3> dst_orig(_dst_orig, dst_orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    ConstBuffer<size_t, 2> src_pitches(_src_pitches, src_pitches_l);
-    ConstBuffer<size_t, 2> dst_pitches(_dst_pitches, dst_pitches_l);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(
-                clEnqueueCopyBufferRect, queue, src, dst, src_orig, dst_orig,
-                reg, src_pitches[0], src_pitches[1], dst_pitches[0],
-                dst_pitches[1], wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clEnqueueCopyBufferRect, "CL 1.0")
-#endif
-}
-
-// }}}
-
-error*
-buffer__get_sub_region(clobj_t *_sub_buf, clobj_t _buf, size_t orig,
-                       size_t size, cl_mem_flags flags)
-{
-#if PYOPENCL_CL_VERSION >= 0x1010
-    auto buf = static_cast<buffer*>(_buf);
-    return c_handle_error([&] {
-            *_sub_buf = buf->get_sub_region(orig, size, flags);
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clCreateSubBuffer, "CL 1.0")
-#endif
-}
diff --git a/src/c_wrapper/buffer.h b/src/c_wrapper/buffer.h
deleted file mode 100644
index c97a7919b56e5fda3bec2e739520f21991cbc544..0000000000000000000000000000000000000000
--- a/src/c_wrapper/buffer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "memory_object.h"
-#include "clhelper.h"
-
-#ifndef __PYOPENCL_BUFFER_H
-#define __PYOPENCL_BUFFER_H
-
-// {{{ buffer
-
-class buffer : public memory_object {
-public:
-    PYOPENCL_DEF_CL_CLASS(BUFFER);
-    PYOPENCL_INLINE
-    buffer(cl_mem mem, bool retain)
-        : memory_object(mem, retain)
-    {}
-
-#if PYOPENCL_CL_VERSION >= 0x1010
-    PYOPENCL_USE_RESULT buffer *get_sub_region(size_t orig, size_t size,
-                                               cl_mem_flags flags) const;
-#endif
-};
-
-extern template void print_clobj<buffer>(std::ostream&, const buffer*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/clhelper.h b/src/c_wrapper/clhelper.h
deleted file mode 100644
index d0aff85c329ead7956ed2621fa4c00a1c887ab7c..0000000000000000000000000000000000000000
--- a/src/c_wrapper/clhelper.h
+++ /dev/null
@@ -1,254 +0,0 @@
-#include "error.h"
-#include "clobj.h"
-
-#ifndef __PYOPENCL_CLHELPER_H
-#define __PYOPENCL_CLHELPER_H
-
-template<typename CLObj, typename... T>
-class _CLObjOutArg : public OutArg {
-    typedef typename CLObj::cl_type CLType;
-    clobj_t *const m_ret;
-    CLType m_clobj;
-    cl_int (CL_API_CALL *m_release)(CLType);
-    const char *m_name;
-    std::tuple<T...> m_t1;
-    template<int... S>
-    PYOPENCL_INLINE CLObj*
-    __new_obj(seq<S...>)
-    {
-        return new CLObj(m_clobj, false, std::get<S>(m_t1)...);
-    }
-public:
-    PYOPENCL_INLINE
-    _CLObjOutArg(clobj_t *ret, cl_int (CL_API_CALL *release)(CLType),
-                 const char *name, T... t1) noexcept
-        : m_ret(ret), m_clobj(nullptr), m_release(release),
-          m_name(name), m_t1(t1...)
-    {
-    }
-    PYOPENCL_INLINE
-    _CLObjOutArg(_CLObjOutArg<CLObj, T...> &&other) noexcept
-        : m_ret(other.m_ret), m_clobj(other.m_clobj),
-          m_release(other.m_release), m_name(other.m_name)
-    {
-        std::swap(m_t1, other.m_t1);
-    }
-    PYOPENCL_INLINE typename CLObj::cl_type*
-    get()
-    {
-        return &m_clobj;
-    }
-    PYOPENCL_INLINE void
-    convert()
-    {
-        *m_ret = __new_obj(typename gens<sizeof...(T)>::type());
-    }
-    PYOPENCL_INLINE void
-    cleanup(bool converted)
-    {
-        if (converted) {
-            delete *m_ret;
-            *m_ret = nullptr;
-        } else {
-            call_guarded_cleanup(m_release, m_name, m_clobj);
-        }
-    }
-    PYOPENCL_INLINE void
-    print(std::ostream &stm, bool out=false) const
-    {
-        print_arg(stm, m_clobj, out);
-    }
-};
-
-template<typename CLObj, typename... T>
-static PYOPENCL_INLINE _CLObjOutArg<CLObj, T...>
-make_cloutarg(clobj_t *ret, cl_int (CL_API_CALL *release)(typename CLObj::cl_type),
-              const char *name, T... t1)
-{
-    return _CLObjOutArg<CLObj, T...>(ret, release, name, t1...);
-}
-#define pyopencl_outarg(type, ret, func, ...)               \
-    make_cloutarg<type>(ret, func, #func, ##__VA_ARGS__)
-
-// {{{ GetInfo helpers
-
-template<typename T, typename... ArgTypes, typename... ArgTypes2>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE pyopencl_buf<T>
-get_vec_info(cl_int (CL_API_CALL *func)(ArgTypes...), const char *name,
-             ArgTypes2&&... args)
-{
-    size_t size = 0;
-    call_guarded(func, name, args..., 0, nullptr, buf_arg(size));
-    pyopencl_buf<T> buf(size / sizeof(T));
-    call_guarded(func, name, args..., size_arg(buf), buf_arg(size));
-    return buf;
-}
-#define pyopencl_get_vec_info(type, what, ...)                      \
-    get_vec_info<type>(clGet##what##Info, "clGet" #what "Info", __VA_ARGS__)
-
-inline generic_info make_generic_info(class_t opaque_class, const char *type, bool free_type, void *value, bool free_value)
-{
-  generic_info result;
-  result.opaque_class = opaque_class;
-  result.type = type;
-  result.free_type = free_type;
-  result.value = value;
-  result.free_value = free_value;
-  return result;
-}
-
-template<typename T>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE generic_info
-convert_array_info(const char *tname, pyopencl_buf<T> &buf)
-{
-    return make_generic_info(
-        CLASS_NONE,
-        _copy_str(std::string(tname) + "[" + tostring(buf.len()) + "]"),
-        true,
-        buf.release(),
-        true);
-}
-
-template<typename T>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE generic_info
-convert_array_info(const char *tname, pyopencl_buf<T> &&_buf)
-{
-    pyopencl_buf<T> &buf = _buf;
-    return convert_array_info<T>(tname, buf);
-}
-
-#define pyopencl_convert_array_info(type, buf)          \
-    convert_array_info<type>(#type, buf)
-#define pyopencl_get_array_info(type, what, ...)                    \
-    pyopencl_convert_array_info(type, pyopencl_get_vec_info(type, what, __VA_ARGS__))
-
-template<typename CLObj, typename T>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE generic_info
-convert_opaque_array_info(T &&buf)
-{
-    return make_generic_info(
-        CLObj::class_id,
-        _copy_str(std::string("void*[") + tostring(buf.len()) + "]"),
-        true,
-        buf_to_base<CLObj>(std::forward<T>(buf)).release(),
-        true);
-}
-#define pyopencl_get_opaque_array_info(cls, what, ...)  \
-    convert_opaque_array_info<cls>(               \
-        pyopencl_get_vec_info(cls::cl_type, what, __VA_ARGS__))
-
-template<typename CLObj, typename... ArgTypes, typename... ArgTypes2>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE generic_info
-get_opaque_info(cl_int (CL_API_CALL *func)(ArgTypes...), const char *name,
-                ArgTypes2&&... args)
-{
-    typename CLObj::cl_type param_value;
-    call_guarded(func, name, args..., size_arg(param_value), nullptr);
-    void *value;
-    if (param_value) {
-        value = (void*)(new CLObj(param_value, /*retain*/ true));
-    } else {
-        value = nullptr;
-    }
-    return make_generic_info(CLObj::class_id, "void *", false, value, true);
-}
-#define pyopencl_get_opaque_info(clobj, what, ...)              \
-    get_opaque_info<clobj>(clGet##what##Info,             \
-                                     "clGet" #what "Info", __VA_ARGS__)
-
-template<typename... ArgTypes, typename... ArgTypes2>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE generic_info
-get_str_info(cl_int (CL_API_CALL *func)(ArgTypes...), const char *name,
-             ArgTypes2&&... args)
-{
-    size_t size;
-    call_guarded(func, name, args..., 0, nullptr, buf_arg(size));
-    pyopencl_buf<char> param_value(size);
-    call_guarded(func, name, args..., param_value, buf_arg(size));
-    return make_generic_info(CLASS_NONE, "char*", false, (void*)param_value.release(), true);
-}
-#define pyopencl_get_str_info(what, ...)                            \
-    get_str_info(clGet##what##Info, "clGet" #what "Info", __VA_ARGS__)
-
-template<typename T, typename... ArgTypes, typename... ArgTypes2>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE generic_info
-get_int_info(cl_int (CL_API_CALL *func)(ArgTypes...), const char *name,
-             const char *tpname, ArgTypes2&&... args)
-{
-    T value;
-    call_guarded(func, name, args..., size_arg(value), nullptr);
-    return make_generic_info(CLASS_NONE, tpname, false, cl_memdup(&value), true);
-}
-#define pyopencl_get_int_info(type, what, ...)                      \
-    get_int_info<type>(clGet##what##Info, "clGet" #what "Info", \
-                                 #type "*", __VA_ARGS__)
-
-// }}}
-
-template<typename T, typename CLType, typename... ArgTypes>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE T*
-convert_obj(cl_int (CL_API_CALL *clRelease)(CLType), const char *name, CLType cl_obj,
-            ArgTypes&&... args)
-{
-    try {
-        return new T(cl_obj, false, std::forward<ArgTypes>(args)...);
-    } catch (...) {
-        call_guarded_cleanup(clRelease, name, cl_obj);
-        throw;
-    }
-}
-#define pyopencl_convert_obj(type, func, ...)       \
-    convert_obj<type>(func, #func, __VA_ARGS__)
-
-// {{{ extension function pointers
-
-#if PYOPENCL_CL_VERSION >= 0x1020
-template<typename T>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE T
-get_ext_fun(cl_platform_id plat, const char *name, const char *err)
-{
-    T func = (T)clGetExtensionFunctionAddressForPlatform(plat, name);
-    if (!func) {
-        throw clerror(name, CL_INVALID_VALUE, err);
-    }
-    return func;
-}
-#define pyopencl_get_ext_fun(plat, name)                                \
-    get_ext_fun<name##_fn>(plat, #name, #name " not available")
-#else
-template<typename T>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE T
-get_ext_fun(const char *name, const char *err)
-{
-    T func = (T)clGetExtensionFunctionAddress(name);
-    if (!func) {
-        throw clerror(name, CL_INVALID_VALUE, err);
-    }
-    return func;
-}
-#define pyopencl_get_ext_fun(plat, name)                                \
-    get_ext_fun<name##_fn>(#name, #name " not available")
-#endif
-
-// }}}
-
-static PYOPENCL_INLINE std::ostream&
-operator<<(std::ostream &stm, const cl_image_format &fmt)
-{
-    stm << "channel_order: " << fmt.image_channel_order
-        << ",\nchannel_data_type: " << fmt.image_channel_data_type;
-    return stm;
-}
-
-#ifdef CL_DEVICE_TOPOLOGY_AMD
-static PYOPENCL_INLINE std::ostream&
-operator<<(std::ostream &stm, const cl_device_topology_amd &topol)
-{
-    stm << "pcie.bus: " << topol.pcie.bus
-        << ",\npcie.device: " << topol.pcie.device
-        << ",\npcie.function: " << topol.pcie.function
-        << ",\npcie.type: " << topol.pcie.type;
-    return stm;
-}
-#endif
-#endif
diff --git a/src/c_wrapper/clinfo_ext.h b/src/c_wrapper/clinfo_ext.h
deleted file mode 100644
index 43b7b6082fda28ad433f26c5d9a5e2e743e24940..0000000000000000000000000000000000000000
--- a/src/c_wrapper/clinfo_ext.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Include OpenCL header, and define OpenCL extensions, since what is and is not
- * available in the official headers is very system-dependent */
-
-#ifndef _EXT_H
-#define _EXT_H
-
-#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
-#include <OpenCL/opencl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-/* These two defines were introduced in the 1.2 headers
- * on 2012-11-30, so earlier versions don't have them
- * (e.g. Debian wheezy)
- */
-
-#ifndef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
-#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                 0x104A
-#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT          0x104B
-#endif
-
-/*
- * Extensions
- */
-
-/* cl_khr_icd */
-#define CL_PLATFORM_ICD_SUFFIX_KHR			0x0920
-#define CL_PLATFORM_NOT_FOUND_KHR			-1001
-
-
-/* cl_khr_fp64 */
-#define CL_DEVICE_DOUBLE_FP_CONFIG			0x1032
-
-/* cl_khr_fp16 */
-#define CL_DEVICE_HALF_FP_CONFIG			0x1033
-
-/* cl_khr_terminate_context */
-#define CL_DEVICE_TERMINATE_CAPABILITY_KHR		0x200F
-
-/* cl_nv_device_attribute_query */
-#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV		0x4000
-#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV		0x4001
-#define CL_DEVICE_REGISTERS_PER_BLOCK_NV		0x4002
-#define CL_DEVICE_WARP_SIZE_NV				0x4003
-#define CL_DEVICE_GPU_OVERLAP_NV			0x4004
-#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV		0x4005
-#define CL_DEVICE_INTEGRATED_MEMORY_NV			0x4006
-#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV	0x4007
-#define CL_DEVICE_PCI_BUS_ID_NV				0x4008
-#define CL_DEVICE_PCI_SLOT_ID_NV			0x4009
-
-/* cl_ext_atomic_counters_{32,64} */
-#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT		0x4032
-
-/* cl_amd_device_attribute_query */
-#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD		0x4036
-#define CL_DEVICE_TOPOLOGY_AMD				0x4037
-#define CL_DEVICE_BOARD_NAME_AMD			0x4038
-#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD		0x4039
-#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD		0x4040
-#define CL_DEVICE_SIMD_WIDTH_AMD			0x4041
-#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD		0x4042
-#define CL_DEVICE_WAVEFRONT_WIDTH_AMD			0x4043
-#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD		0x4044
-#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD		0x4045
-#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD	0x4046
-#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD	0x4047
-#define CL_DEVICE_LOCAL_MEM_BANKS_AMD			0x4048
-#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD		0x4049
-#define CL_DEVICE_GFXIP_MAJOR_AMD			0x404A
-#define CL_DEVICE_GFXIP_MINOR_AMD			0x404B
-#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD		0x404C
-
-#ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
-#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD		1
-
-typedef union
-{
-	struct { cl_uint type; cl_uint data[5]; } raw;
-	struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
-} cl_device_topology_amd;
-#endif
-
-/* cl_amd_offline_devices */
-#define CL_CONTEXT_OFFLINE_DEVICES_AMD			0x403F
-
-/* cl_ext_device_fission */
-#define cl_ext_device_fission				1
-
-typedef cl_ulong  cl_device_partition_property_ext;
-
-#define CL_DEVICE_PARTITION_EQUALLY_EXT			0x4050
-#define CL_DEVICE_PARTITION_BY_COUNTS_EXT		0x4051
-#define CL_DEVICE_PARTITION_BY_NAMES_EXT		0x4052
-#define CL_DEVICE_PARTITION_BY_NAMES_INTEL		0x4052 /* cl_intel_device_partition_by_names */
-#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT	0x4053
-
-#define CL_DEVICE_PARENT_DEVICE_EXT			0x4054
-#define CL_DEVICE_PARTITION_TYPES_EXT			0x4055
-#define CL_DEVICE_AFFINITY_DOMAINS_EXT			0x4056
-#define CL_DEVICE_REFERENCE_COUNT_EXT			0x4057
-#define CL_DEVICE_PARTITION_STYLE_EXT			0x4058
-
-#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT			0x1
-#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT			0x2
-#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT			0x3
-#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT			0x4
-#define CL_AFFINITY_DOMAIN_NUMA_EXT			0x10
-#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT		0x100
-
-/* cl_intel_advanced_motion_estimation */
-#define CL_DEVICE_ME_VERSION_INTEL			0x407E
-
-/* cl_qcom_ext_host_ptr */
-#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM		0x40A0
-#define CL_DEVICE_PAGE_SIZE_QCOM			0x40A1
-
-/* cl_khr_spir */
-#define CL_DEVICE_SPIR_VERSIONS				0x40E0
-
-/* cl_altera_device_temperature */
-#define CL_DEVICE_CORE_TEMPERATURE_ALTERA		0x40F3
-
-/* cl_intel_simultaneous_sharing */
-#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL		0x4104
-#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL	0x4105
-
-#endif
diff --git a/src/c_wrapper/clobj.h b/src/c_wrapper/clobj.h
deleted file mode 100644
index 5db08710f2b10193b5cc5e8528257317f666de32..0000000000000000000000000000000000000000
--- a/src/c_wrapper/clobj.h
+++ /dev/null
@@ -1,149 +0,0 @@
-#include "utils.h"
-
-#ifndef __PYOPENCL_CLOBJ_H
-#define __PYOPENCL_CLOBJ_H
-
-#define PYOPENCL_DEF_CL_CLASS(name)                     \
-    constexpr static class_t class_id = CLASS_##name;   \
-    constexpr static const char *class_name = #name;
-
-struct clbase {
-private:
-    // non-copyable
-    clbase(const clbase&) = delete;
-    clbase &operator=(const clbase&) = delete;
-    bool operator==(clbase const &other) const = delete;
-    bool operator!=(clbase const &other) const = delete;
-public:
-    clbase() = default;
-    virtual ~clbase() = default;
-    virtual intptr_t intptr() const = 0;
-    virtual generic_info get_info(cl_uint) const = 0;
-};
-
-template<typename CLType>
-class clobj : public clbase {
-private:
-    CLType m_obj;
-public:
-    typedef CLType cl_type;
-    PYOPENCL_INLINE
-    clobj(CLType obj, bool=false) : m_obj(obj)
-    {}
-    PYOPENCL_INLINE const CLType&
-    data() const
-    {
-        return m_obj;
-    }
-    intptr_t
-    intptr() const
-    {
-        return (intptr_t)m_obj;
-    }
-};
-
-template<typename CLObj>
-void
-print_clobj(std::ostream &stm, const CLObj *obj)
-{
-    stm << CLObj::class_name << "(" << (const void*)obj << ")<"
-        << (const void*)obj->data() << ">";
-}
-
-template<typename CLObj>
-class CLArg<CLObj, enable_if_t<std::is_base_of<clobj<typename CLObj::cl_type>,
-                                               CLObj>::value> > {
-private:
-    CLObj &m_obj;
-public:
-    CLArg(CLObj &obj) : m_obj(obj)
-    {
-    }
-    PYOPENCL_INLINE const typename CLObj::cl_type&
-    convert() const
-    {
-        return m_obj.data();
-    }
-    PYOPENCL_INLINE void
-    print(std::ostream &stm)
-    {
-        print_clobj(stm, &m_obj);
-    }
-};
-
-template<typename CLObj>
-class CLArg<CLObj*, enable_if_t<std::is_base_of<clobj<typename CLObj::cl_type>,
-                                                CLObj>::value> > {
-private:
-    CLObj *m_obj;
-public:
-    CLArg(CLObj *obj) : m_obj(obj)
-    {
-    }
-    PYOPENCL_INLINE const typename CLObj::cl_type&
-    convert() const
-    {
-        return m_obj->data();
-    }
-    PYOPENCL_INLINE void
-    print(std::ostream &stm)
-    {
-        print_clobj(stm, m_obj);
-    }
-};
-
-template<typename CLObj>
-static PYOPENCL_INLINE CLObj*
-clobj_from_int_ptr(intptr_t ptr, bool retain)
-{
-    return new CLObj(reinterpret_cast<typename CLObj::cl_type>(ptr), retain);
-}
-
-template<typename T, typename T2>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE pyopencl_buf<typename T::cl_type>
-buf_from_class(T2 *buf2, size_t len)
-{
-    pyopencl_buf<typename T::cl_type> buf(len);
-    for (size_t i = 0;i < len;i++) {
-        buf[i] = static_cast<const T*>(buf2[i])->data();
-    }
-    return buf;
-}
-
-template<typename T, typename T2>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE pyopencl_buf<typename T::cl_type>
-buf_from_class(T2 &&buf)
-{
-    return buf_from_class(buf.get(), buf.len());
-}
-
-template<typename T, typename T2, typename... ArgTypes>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE pyopencl_buf<clbase*>
-buf_to_base(T2 *buf2, size_t len, ArgTypes&&... args)
-{
-    pyopencl_buf<clbase*> buf(len);
-    size_t i = 0;
-    try {
-        for (;i < len;i++) {
-            buf[i] = static_cast<clbase*>(
-                new T((typename T::cl_type)buf2[i],
-                      std::forward<ArgTypes>(args)...));
-        }
-    } catch (...) {
-        for (size_t j = 0;j < i;j++) {
-            delete buf[i];
-        }
-        throw;
-    }
-    return buf;
-}
-
-template<typename T, typename T2, typename... ArgTypes>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE pyopencl_buf<clbase*>
-buf_to_base(T2 &&buf2, ArgTypes&&... args)
-{
-    return buf_to_base<T>(buf2.get(), buf2.len(),
-                           std::forward<ArgTypes>(args)...);
-}
-
-#endif
diff --git a/src/c_wrapper/command_queue.cpp b/src/c_wrapper/command_queue.cpp
deleted file mode 100644
index b8ecef1ee6b950b23888e37032caf632e3fe9bb4..0000000000000000000000000000000000000000
--- a/src/c_wrapper/command_queue.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-#include "command_queue.h"
-#include "device.h"
-#include "context.h"
-#include "event.h"
-#include "clhelper.h"
-
-template class clobj<cl_command_queue>;
-template void print_arg<cl_command_queue>(std::ostream&,
-                                          const cl_command_queue&, bool);
-template void print_clobj<command_queue>(std::ostream&, const command_queue*);
-template void print_buf<cl_command_queue>(
-    std::ostream&, const cl_command_queue*, size_t, ArgType, bool, bool);
-
-command_queue::~command_queue()
-{
-    pyopencl_call_guarded_cleanup(clReleaseCommandQueue, PYOPENCL_CL_CASTABLE_THIS);
-}
-
-generic_info
-command_queue::get_info(cl_uint param_name) const
-{
-    switch ((cl_command_queue_info)param_name) {
-    case CL_QUEUE_CONTEXT:
-        return pyopencl_get_opaque_info(context, CommandQueue,
-                                        PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_QUEUE_DEVICE:
-        return pyopencl_get_opaque_info(device, CommandQueue, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_QUEUE_REFERENCE_COUNT:
-        return pyopencl_get_int_info(cl_uint, CommandQueue,
-                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_QUEUE_PROPERTIES:
-        return pyopencl_get_int_info(cl_command_queue_properties,
-                                     CommandQueue, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    default:
-        throw clerror("CommandQueue.get_info", CL_INVALID_VALUE);
-    }
-}
-
-// c wrapper
-
-// Command Queue
-error*
-create_command_queue(clobj_t *queue, clobj_t _ctx,
-                     clobj_t _dev, cl_command_queue_properties props)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    auto py_dev = static_cast<device*>(_dev);
-    return c_handle_error([&] {
-            cl_device_id dev;
-            if (py_dev) {
-                dev = py_dev->data();
-            } else {
-                auto devs = pyopencl_get_vec_info(cl_device_id, Context,
-                                                  ctx, CL_CONTEXT_DEVICES);
-                if (devs.len() == 0) {
-                    throw clerror("CommandQueue", CL_INVALID_VALUE,
-                                  "context doesn't have any devices? -- "
-                                  "don't know which one to default to");
-                }
-                dev = devs[0];
-            }
-            cl_command_queue cl_queue =
-                pyopencl_call_guarded(clCreateCommandQueue, ctx, dev, props);
-            *queue = new command_queue(cl_queue, false);
-        });
-}
-
-error*
-command_queue__finish(clobj_t queue)
-{
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clFinish, static_cast<command_queue*>(queue));
-        });
-}
-
-error*
-command_queue__flush(clobj_t queue)
-{
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clFlush, static_cast<command_queue*>(queue));
-        });
-}
-
-error*
-enqueue_marker_with_wait_list(clobj_t *evt, clobj_t _queue,
-                              const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    auto queue = static_cast<command_queue*>(_queue);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clEnqueueMarkerWithWaitList, queue,
-                                  wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueMarkerWithWaitList, "CL 1.2")
-#endif
-}
-
-error*
-enqueue_barrier_with_wait_list(clobj_t *evt, clobj_t _queue,
-                               const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    auto queue = static_cast<command_queue*>(_queue);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clEnqueueBarrierWithWaitList, queue,
-                                  wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueBarrierWithWaitList, "CL 1.2")
-#endif
-}
-
-error*
-enqueue_marker(clobj_t *evt, clobj_t _queue)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clEnqueueMarker, queue, event_out(evt));
-        });
-}
-
-error*
-enqueue_barrier(clobj_t _queue)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clEnqueueBarrier, queue);
-        });
-}
diff --git a/src/c_wrapper/command_queue.h b/src/c_wrapper/command_queue.h
deleted file mode 100644
index 3a7c01710133f90c40e3afab58abc96f88277f86..0000000000000000000000000000000000000000
--- a/src/c_wrapper/command_queue.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "error.h"
-
-#ifndef __PYOPENCL_COMMAND_QUEUE_H
-#define __PYOPENCL_COMMAND_QUEUE_H
-
-// {{{ command_queue
-
-extern template class clobj<cl_command_queue>;
-extern template void print_arg<cl_command_queue>(
-    std::ostream&, const cl_command_queue&, bool);
-extern template void print_buf<cl_command_queue>(
-    std::ostream&, const cl_command_queue*, size_t, ArgType, bool, bool);
-
-class command_queue : public clobj<cl_command_queue> {
-public:
-    PYOPENCL_DEF_CL_CLASS(COMMAND_QUEUE);
-    PYOPENCL_INLINE
-    command_queue(cl_command_queue q, bool retain)
-        : clobj(q)
-    {
-        if (retain) {
-            pyopencl_call_guarded(clRetainCommandQueue, PYOPENCL_CL_CASTABLE_THIS);
-        }
-    }
-    PYOPENCL_INLINE
-    command_queue(const command_queue &queue)
-        : command_queue(queue.data(), true)
-    {}
-    ~command_queue();
-
-    generic_info get_info(cl_uint param_name) const;
-
-#if 0
-
-    PYOPENCL_USE_RESULT std::unique_ptr<context>
-    get_context() const
-    {
-        cl_context param_value;
-        pyopencl_call_guarded(clGetCommandQueueInfo, this, CL_QUEUE_CONTEXT,
-                              size_arg(param_value), nullptr);
-        return std::unique_ptr<context>(
-            new context(param_value, /*retain*/ true));
-    }
-
-#if PYOPENCL_CL_VERSION < 0x1010
-    cl_command_queue_properties
-    set_property(cl_command_queue_properties prop, bool enable) const
-    {
-        cl_command_queue_properties old_prop;
-        pyopencl_call_guarded(clSetCommandQueueProperty, this, prop,
-                              enable, buf_arg(old_prop));
-        return old_prop;
-    }
-#endif
-
-#endif
-};
-
-extern template void print_clobj<command_queue>(std::ostream&,
-                                                const command_queue*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/context.cpp b/src/c_wrapper/context.cpp
deleted file mode 100644
index 0fe48554f954e46dad0ef5561932a9cb9fdb75ff..0000000000000000000000000000000000000000
--- a/src/c_wrapper/context.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-#include "context.h"
-#include "device.h"
-#include "platform.h"
-#include "clhelper.h"
-
-template class clobj<cl_context>;
-template void print_arg<cl_context>(std::ostream&, const cl_context&, bool);
-template void print_clobj<context>(std::ostream&, const context*);
-template void print_buf<cl_context>(std::ostream&, const cl_context*,
-                                    size_t, ArgType, bool, bool);
-
-void
-context::get_version(cl_context ctx, int *major, int *minor)
-{
-    cl_device_id s_buff[16];
-    size_t size;
-    pyopencl_buf<cl_device_id> d_buff(0);
-    cl_device_id *devs = s_buff;
-    pyopencl_call_guarded(clGetContextInfo, ctx, CL_CONTEXT_DEVICES,
-                          0, nullptr, buf_arg(size));
-    if (PYOPENCL_UNLIKELY(!size)) {
-        throw clerror("Context.get_version", CL_INVALID_VALUE,
-                      "Cannot get devices from context.");
-    }
-    if (PYOPENCL_UNLIKELY(size > sizeof(s_buff))) {
-        d_buff.resize(size / sizeof(cl_device_id));
-        devs = d_buff.get();
-    }
-    pyopencl_call_guarded(clGetContextInfo, ctx, CL_CONTEXT_DEVICES,
-                          size_arg(devs, size), buf_arg(size));
-    device::get_version(devs[0], major, minor);
-}
-
-context::~context()
-{
-    pyopencl_call_guarded_cleanup(clReleaseContext, PYOPENCL_CL_CASTABLE_THIS);
-}
-
-generic_info
-context::get_info(cl_uint param_name) const
-{
-    switch ((cl_context_info)param_name) {
-    case CL_CONTEXT_REFERENCE_COUNT:
-        return pyopencl_get_int_info(cl_uint, Context,
-                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_CONTEXT_DEVICES:
-        return pyopencl_get_opaque_array_info(device, Context,
-                                              PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_CONTEXT_PROPERTIES: {
-        auto result = pyopencl_get_vec_info(
-            cl_context_properties, Context, PYOPENCL_CL_CASTABLE_THIS, param_name);
-        pyopencl_buf<generic_info> py_result(result.len() / 2);
-        size_t i = 0;
-        for (;i < py_result.len();i++) {
-            cl_context_properties key = result[i * 2];
-            if (key == 0)
-                break;
-            cl_context_properties value = result[i * 2 + 1];
-            switch (key) {
-            case CL_CONTEXT_PLATFORM:
-              py_result[i] = make_generic_info(
-                  CLASS_PLATFORM,
-                  "void *", false,
-                  new platform(reinterpret_cast<cl_platform_id>(value)), true);
-                break;
-
-#if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1)
-#if defined(__APPLE__) && defined(HAVE_GL) && !defined(PYOPENCL_APPLE_USE_CL_H)
-            case CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE:
-#else
-            case CL_GL_CONTEXT_KHR:
-            case CL_EGL_DISPLAY_KHR:
-            case CL_GLX_DISPLAY_KHR:
-            case CL_WGL_HDC_KHR:
-            case CL_CGL_SHAREGROUP_KHR:
-#endif
-              py_result[i] = make_generic_info(
-                  CLASS_NONE,
-                  "intptr_t *", false,
-                  (void*)value,
-                  // we do not own this object
-                  false);
-              break;
-#endif
-            default:
-                throw clerror("Context.get_info", CL_INVALID_VALUE,
-                              "unknown context_property key encountered");
-            }
-        }
-        py_result.resize(i);
-        return pyopencl_convert_array_info(generic_info, py_result);
-    }
-
-#if PYOPENCL_CL_VERSION >= 0x1010
-    case CL_CONTEXT_NUM_DEVICES:
-        return pyopencl_get_int_info(cl_uint, Context,
-                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-
-    default:
-        throw clerror("Context.get_info", CL_INVALID_VALUE);
-    }
-}
-
-// c wrapper
-
-// Context
-error*
-create_context(clobj_t *_ctx, const cl_context_properties *props,
-               cl_uint num_devices, const clobj_t *_devices)
-{
-    // TODO debug print properties
-    return c_handle_error([&] {
-            const auto devices = buf_from_class<device>(_devices, num_devices);
-            *_ctx = new context(
-                pyopencl_call_guarded(
-                    clCreateContext,
-                    const_cast<cl_context_properties*>(props),
-                    devices, nullptr, nullptr), false);
-        });
-}
-
-// Context
-error*
-create_context_from_type(clobj_t *_ctx, const cl_context_properties *props,
-                         cl_device_type dev_type)
-{
-    // TODO debug print properties
-    return c_handle_error([&] {
-            *_ctx = new context(
-                pyopencl_call_guarded(
-                    clCreateContextFromType,
-                    const_cast<cl_context_properties*>(props),
-                    dev_type, nullptr, nullptr), false);
-        });
-}
-
-error*
-context__get_supported_image_formats(clobj_t _ctx, cl_mem_flags flags,
-                                     cl_mem_object_type image_type,
-                                     generic_info *out)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            cl_uint num;
-            pyopencl_call_guarded(clGetSupportedImageFormats, ctx, flags,
-                                  image_type, 0, nullptr, buf_arg(num));
-            pyopencl_buf<cl_image_format> formats(num);
-            pyopencl_call_guarded(clGetSupportedImageFormats, ctx, flags,
-                                  image_type, formats, buf_arg(num));
-            *out = pyopencl_convert_array_info(cl_image_format, formats);
-        });
-}
diff --git a/src/c_wrapper/context.h b/src/c_wrapper/context.h
deleted file mode 100644
index 1691035d09fb5628c0bbda967c205f30a4882100..0000000000000000000000000000000000000000
--- a/src/c_wrapper/context.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "error.h"
-
-#ifndef __PYOPENCL_CONTEXT_H
-#define __PYOPENCL_CONTEXT_H
-
-// {{{ context
-
-extern template class clobj<cl_context>;
-extern template void print_arg<cl_context>(std::ostream&,
-                                           const cl_context&, bool);
-extern template void print_buf<cl_context>(std::ostream&, const cl_context*,
-                                           size_t, ArgType, bool, bool);
-
-class context : public clobj<cl_context> {
-public:
-    PYOPENCL_DEF_CL_CLASS(CONTEXT);
-    static void get_version(cl_context ctx, int *major, int *minor);
-    PYOPENCL_INLINE
-    context(cl_context ctx, bool retain)
-        : clobj(ctx)
-    {
-        if (retain) {
-            pyopencl_call_guarded(clRetainContext, PYOPENCL_CL_CASTABLE_THIS);
-        }
-    }
-    ~context();
-    generic_info get_info(cl_uint param_name) const;
-};
-
-extern template void print_clobj<context>(std::ostream&, const context*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/debug.cpp b/src/c_wrapper/debug.cpp
deleted file mode 100644
index a118b4687148ad8fd4cdc846cbeb34de0add14d7..0000000000000000000000000000000000000000
--- a/src/c_wrapper/debug.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-#include "debug.h"
-#include <iostream>
-#include <ios>
-#include <iomanip>
-#include <stdlib.h>
-
-std::mutex dbg_lock;
-
-void
-dbg_print_str(std::ostream &stm, const char *str, size_t len)
-{
-    stm << '"';
-    for (size_t i = 0;i < len;i++) {
-        char escaped = 0;
-#define escape_char(in, out)                    \
-        case in:                                \
-            escaped = out;                      \
-            break
-        switch (str[i]) {
-            escape_char('\'', '\'');
-            escape_char('\"', '\"');
-            escape_char('\?', '\?');
-            escape_char('\\', '\\');
-            escape_char('\0', '0');
-            escape_char('\a', 'a');
-            escape_char('\b', 'b');
-            escape_char('\f', 'f');
-            escape_char('\r', 'r');
-            escape_char('\v', 'v');
-        default:
-            break;
-        }
-        if (escaped) {
-            stm << '\\' << escaped;
-        } else {
-            stm << str[i];
-        }
-    }
-    stm << '"';
-}
-
-void
-dbg_print_bytes(std::ostream &stm, const unsigned char *bytes, size_t len)
-{
-    stm << '"';
-    for (size_t i = 0;i < len;i++) {
-        stm << "\\x" << std::hex << std::setfill('0')
-            << std::setw(2) << bytes[i];
-    }
-    stm << std::dec << '"';
-}
-
-static PYOPENCL_INLINE bool
-_get_debug_env()
-{
-    const char *env = getenv("PYOPENCL_DEBUG");
-    const bool default_debug = DEFAULT_DEBUG;
-    if (!env) {
-        return default_debug;
-    }
-    if (strcasecmp(env, "0") == 0 || strcasecmp(env, "f") == 0 ||
-        strcasecmp(env, "false") == 0 || strcasecmp(env, "off") == 0) {
-        return false;
-    }
-    if (strcasecmp(env, "1") == 0 || strcasecmp(env, "t") == 0 ||
-        strcasecmp(env, "true") == 0 || strcasecmp(env, "on") == 0) {
-        return true;
-    }
-    return default_debug;
-}
-
-bool debug_enabled = _get_debug_env();
-
-int
-get_debug()
-{
-    return (int) debug_enabled;
-}
-
-void
-set_debug(int debug)
-{
-    debug_enabled = (bool)debug;
-}
diff --git a/src/c_wrapper/debug.h b/src/c_wrapper/debug.h
deleted file mode 100644
index f0700030fa9ea0255aba8d37cd5368cb7120aa93..0000000000000000000000000000000000000000
--- a/src/c_wrapper/debug.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#include "wrap_cl.h"
-#include "function.h"
-#include <string.h>
-#include <mutex>
-
-#ifdef __MINGW32__
-#include "mingw-std-threads/mingw.mutex.h"
-#include "mingw-std-threads/mingw.thread.h"
-#endif
-
-#ifndef __PYOPENCL_DEBUG_H
-#define __PYOPENCL_DEBUG_H
-
-extern bool debug_enabled;
-#ifdef PYOPENCL_TRACE
-#define DEFAULT_DEBUG true
-#else
-#define DEFAULT_DEBUG false
-#endif
-
-#define DEBUG_ON (PYOPENCL_EXPECT(debug_enabled, DEFAULT_DEBUG))
-
-extern std::mutex dbg_lock;
-
-void dbg_print_str(std::ostream&, const char*, size_t);
-static PYOPENCL_INLINE void
-dbg_print_str(std::ostream &stm, const char *str)
-{
-    return dbg_print_str(stm, str, strlen(str));
-}
-void dbg_print_bytes(std::ostream &stm, const unsigned char *bytes, size_t len);
-
-#endif
diff --git a/src/c_wrapper/device.cpp b/src/c_wrapper/device.cpp
deleted file mode 100644
index 16edaf34c7be2934e6350855a1788bf1311a6641..0000000000000000000000000000000000000000
--- a/src/c_wrapper/device.cpp
+++ /dev/null
@@ -1,375 +0,0 @@
-#include "device.h"
-#include "platform.h"
-
-template class clobj<cl_device_id>;
-template void print_arg<cl_device_id>(std::ostream&,
-                                      const cl_device_id&, bool);
-template void print_clobj<device>(std::ostream&, const device*);
-template void print_buf<cl_device_id>(std::ostream&, const cl_device_id*,
-                                      size_t, ArgType, bool, bool);
-
-void
-device::get_version(cl_device_id dev, int *major, int *minor)
-{
-    cl_platform_id plat;
-    pyopencl_call_guarded(clGetDeviceInfo, dev, CL_DEVICE_PLATFORM,
-                          size_arg(plat), nullptr);
-    platform::get_version(plat, major, minor);
-}
-
-device::~device()
-{
-    if (false) {
-    }
-#if PYOPENCL_CL_VERSION >= 0x1020
-    else if (m_ref_type == REF_CL_1_2) {
-        pyopencl_call_guarded_cleanup(clReleaseDevice, PYOPENCL_CL_CASTABLE_THIS);
-    }
-#endif
-}
-
-#ifdef CL_DEVICE_TOPOLOGY_AMD
-template<typename... ArgTypes>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE generic_info
-get_device_topology_amd(ArgTypes&&... args)
-{
-    const char * tpname = "cl_device_topology_amd*";
-    cl_device_topology_amd value;
-    const char * fname = "clGetDeviceInfo";
-    call_guarded(clGetDeviceInfo, fname, args..., size_arg(value), nullptr);
-    return make_generic_info(CLASS_NONE, tpname, false, cl_memdup(&value), true);
-}
-
-#define pyopencl_get_device_topology_amd(...) get_device_topology_amd(__VA_ARGS__)
-
-#endif
-
-generic_info
-device::get_info(cl_uint param_name) const
-{
-#define DEV_GET_INT_INF(TYPE)                                   \
-    pyopencl_get_int_info(TYPE, Device, PYOPENCL_CL_CASTABLE_THIS, param_name)
-
-    switch ((cl_device_info)param_name) {
-    case CL_DEVICE_TYPE:
-        return DEV_GET_INT_INF(cl_device_type);
-    case CL_DEVICE_MAX_WORK_GROUP_SIZE:
-        return DEV_GET_INT_INF(size_t);
-    case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
-    case CL_DEVICE_MAX_COMPUTE_UNITS:
-    case CL_DEVICE_VENDOR_ID:
-        return DEV_GET_INT_INF(cl_uint);
-
-    case CL_DEVICE_MAX_WORK_ITEM_SIZES:
-        return pyopencl_get_array_info(size_t, Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
-
-    case CL_DEVICE_MAX_CLOCK_FREQUENCY:
-    case CL_DEVICE_ADDRESS_BITS:
-    case CL_DEVICE_MAX_READ_IMAGE_ARGS:
-    case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
-    case CL_DEVICE_MAX_SAMPLERS:
-    case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
-    case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
-        return DEV_GET_INT_INF(cl_uint);
-
-    case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
-        return DEV_GET_INT_INF(cl_ulong);
-
-    case CL_DEVICE_IMAGE2D_MAX_WIDTH:
-    case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
-    case CL_DEVICE_IMAGE3D_MAX_WIDTH:
-    case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
-    case CL_DEVICE_IMAGE3D_MAX_DEPTH:
-    case CL_DEVICE_MAX_PARAMETER_SIZE:
-        return DEV_GET_INT_INF(size_t);
-
-    case CL_DEVICE_IMAGE_SUPPORT:
-        return DEV_GET_INT_INF(cl_bool);
-#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
-    case CL_DEVICE_DOUBLE_FP_CONFIG:
-#endif
-#ifdef CL_DEVICE_HALF_FP_CONFIG
-    case CL_DEVICE_HALF_FP_CONFIG:
-#endif
-    case CL_DEVICE_SINGLE_FP_CONFIG:
-        return DEV_GET_INT_INF(cl_device_fp_config);
-
-    case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
-        return DEV_GET_INT_INF(cl_device_mem_cache_type);
-    case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
-    case CL_DEVICE_GLOBAL_MEM_SIZE:
-    case CL_DEVICE_LOCAL_MEM_SIZE:
-    case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
-        return DEV_GET_INT_INF(cl_ulong);
-
-    case CL_DEVICE_MAX_CONSTANT_ARGS:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_LOCAL_MEM_TYPE:
-        return DEV_GET_INT_INF(cl_device_local_mem_type);
-    case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
-        return DEV_GET_INT_INF(size_t);
-    case CL_DEVICE_ENDIAN_LITTLE:
-    case CL_DEVICE_AVAILABLE:
-    case CL_DEVICE_COMPILER_AVAILABLE:
-    case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
-        return DEV_GET_INT_INF(cl_bool);
-    case CL_DEVICE_EXECUTION_CAPABILITIES:
-        return DEV_GET_INT_INF(cl_device_exec_capabilities);
-    case CL_DEVICE_QUEUE_PROPERTIES:
-    // same as CL_DEVICE_QUEUE_ON_HOST_PROPERTIES in 2.0
-        return DEV_GET_INT_INF(cl_command_queue_properties);
-
-    case CL_DEVICE_NAME:
-    case CL_DEVICE_VENDOR:
-    case CL_DRIVER_VERSION:
-    case CL_DEVICE_PROFILE:
-    case CL_DEVICE_VERSION:
-    case CL_DEVICE_EXTENSIONS:
-        return pyopencl_get_str_info(Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-
-    case CL_DEVICE_PLATFORM:
-        return pyopencl_get_opaque_info(platform, Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
-        return DEV_GET_INT_INF(cl_uint);
-
-    case CL_DEVICE_HOST_UNIFIED_MEMORY: // deprecated in 2.0
-        return DEV_GET_INT_INF(cl_bool);
-    case CL_DEVICE_OPENCL_C_VERSION:
-        return pyopencl_get_str_info(Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
-    case CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV:
-    case CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV:
-    case CL_DEVICE_REGISTERS_PER_BLOCK_NV:
-    case CL_DEVICE_WARP_SIZE_NV:
-#ifdef CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV
-    case CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV:
-#endif
-#ifdef CL_DEVICE_PCI_BUS_ID_NV
-    case CL_DEVICE_PCI_BUS_ID_NV:
-#endif
-#ifdef CL_DEVICE_PCI_SLOT_ID_NV
-    case CL_DEVICE_PCI_SLOT_ID_NV:
-#endif
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_GPU_OVERLAP_NV:
-    case CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:
-    case CL_DEVICE_INTEGRATED_MEMORY_NV:
-        return DEV_GET_INT_INF(cl_bool);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x1020
-    case CL_DEVICE_LINKER_AVAILABLE:
-        return DEV_GET_INT_INF(cl_bool);
-    case CL_DEVICE_BUILT_IN_KERNELS:
-        return pyopencl_get_str_info(Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE:
-    case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE:
-        return DEV_GET_INT_INF(size_t);
-    case CL_DEVICE_PARENT_DEVICE:
-        return pyopencl_get_opaque_info(device, Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_DEVICE_PARTITION_MAX_SUB_DEVICES:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_PARTITION_TYPE:
-    case CL_DEVICE_PARTITION_PROPERTIES:
-        return pyopencl_get_array_info(cl_device_partition_property,
-                                       Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_DEVICE_PARTITION_AFFINITY_DOMAIN:
-        return pyopencl_get_array_info(cl_device_affinity_domain,
-                                       Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_DEVICE_REFERENCE_COUNT:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC:
-    case CL_DEVICE_PRINTF_BUFFER_SIZE:
-        return DEV_GET_INT_INF(cl_bool);
-#endif
-#ifdef cl_khr_image2d_from_buffer
-    case CL_DEVICE_IMAGE_PITCH_ALIGNMENT:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT:
-        return DEV_GET_INT_INF(cl_uint);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    case CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE:
-        return DEV_GET_INT_INF(size_t);
-    case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES:
-        return DEV_GET_INT_INF(cl_command_queue_properties);
-    case CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_MAX_ON_DEVICE_QUEUES:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_MAX_ON_DEVICE_EVENTS:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_SVM_CAPABILITIES:
-        return DEV_GET_INT_INF(cl_device_svm_capabilities);
-    case CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE:
-        return DEV_GET_INT_INF(size_t);
-    case CL_DEVICE_MAX_PIPE_ARGS:
-    case CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS:
-    case CL_DEVICE_PIPE_MAX_PACKET_SIZE:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT:
-    case CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT:
-    case CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT:
-        return DEV_GET_INT_INF(cl_uint);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2010
-    case CL_DEVICE_IL_VERSION:
-        return pyopencl_get_str_info(Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_DEVICE_MAX_NUM_SUB_GROUPS:
-        return DEV_GET_INT_INF(cl_uint);
-    case CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS:
-        return DEV_GET_INT_INF(cl_bool);
-#endif
-
-
-        // {{{ AMD dev attrs
-        //
-        // types of AMD dev attrs divined from
-        // https://www.khronos.org/registry/cl/api/1.2/cl.hpp
-#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
-    case CL_DEVICE_PROFILING_TIMER_OFFSET_AMD:
-        return DEV_GET_INT_INF(cl_ulong);
-#endif
-#ifdef CL_DEVICE_TOPOLOGY_AMD
-        case CL_DEVICE_TOPOLOGY_AMD:
-            return pyopencl_get_device_topology_amd(PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-#ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD
-    case CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD:
-        return DEV_GET_INT_INF(cl_bool);
-#endif
-#ifdef CL_DEVICE_BOARD_NAME_AMD
-    case CL_DEVICE_BOARD_NAME_AMD: ;
-        return pyopencl_get_str_info(Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
-    case CL_DEVICE_GLOBAL_FREE_MEMORY_AMD:
-        return pyopencl_get_array_info(size_t, Device,
-                                       PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
-    case CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD:
-#endif
-#ifdef CL_DEVICE_SIMD_WIDTH_AMD
-    case CL_DEVICE_SIMD_WIDTH_AMD:
-#endif
-#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
-    case CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD:
-#endif
-#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
-    case CL_DEVICE_WAVEFRONT_WIDTH_AMD:
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
-    case CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD:
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
-    case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD:
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
-    case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD:
-#endif
-#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
-    case CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD:
-#endif
-#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
-    case CL_DEVICE_LOCAL_MEM_BANKS_AMD:
-#endif
-#ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT
-    case CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT:
-#endif
-#ifdef CL_DEVICE_GFXIP_MAJOR_AMD
-    case CL_DEVICE_GFXIP_MAJOR_AMD:
-#endif
-#ifdef CL_DEVICE_GFXIP_MINOR_AMD
-    case CL_DEVICE_GFXIP_MINOR_AMD:
-#endif
-#ifdef CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD
-    case CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD:
-#endif
-        return DEV_GET_INT_INF(cl_uint);
-        // }}}
-#ifdef CL_DEVICE_ME_VERSION_INTEL
-    case CL_DEVICE_ME_VERSION_INTEL:
-#endif
-#ifdef CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM
-    case CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM:
-#endif
-#ifdef CL_DEVICE_PAGE_SIZE_QCOM
-    case CL_DEVICE_PAGE_SIZE_QCOM:
-#endif
-#ifdef CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL
-    case CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL:
-#endif
-        return DEV_GET_INT_INF(cl_uint);
-#ifdef CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL
-    case CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL:
-        return pyopencl_get_array_info(cl_uint, Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-#ifdef CL_DEVICE_SPIR_VERSIONS
-    case CL_DEVICE_SPIR_VERSIONS:
-        return pyopencl_get_str_info(Device, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-#ifdef CL_DEVICE_CORE_TEMPERATURE_ALTERA
-    case CL_DEVICE_CORE_TEMPERATURE_ALTERA:
-        return DEV_GET_INT_INF(cl_int);
-#endif
-
-    default:
-        throw clerror("Device.get_info", CL_INVALID_VALUE);
-    }
-}
-
-#if PYOPENCL_CL_VERSION >= 0x1020
-PYOPENCL_USE_RESULT pyopencl_buf<clobj_t>
-device::create_sub_devices(const cl_device_partition_property *props)
-{
-    // TODO debug print props
-    cl_uint num_devices;
-    pyopencl_call_guarded(clCreateSubDevices, PYOPENCL_CL_CASTABLE_THIS, props, 0, nullptr,
-                          buf_arg(num_devices));
-    pyopencl_buf<cl_device_id> devices(num_devices);
-    pyopencl_call_guarded(clCreateSubDevices, PYOPENCL_CL_CASTABLE_THIS, props, devices,
-                          buf_arg(num_devices));
-    return buf_to_base<device>(devices, true, device::REF_CL_1_2);
-}
-#endif
-
-// c wrapper
-
-error*
-device__create_sub_devices(clobj_t _dev, clobj_t **_devs,
-                           uint32_t *num_devices,
-                           const cl_device_partition_property *props)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    auto dev = static_cast<device*>(_dev);
-    return c_handle_error([&] {
-            auto devs = dev->create_sub_devices(props);
-            *num_devices = (uint32_t)devs.len();
-            *_devs = devs.release();
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clCreateSubDevices, "CL 1.2")
-#endif
-}
diff --git a/src/c_wrapper/device.h b/src/c_wrapper/device.h
deleted file mode 100644
index a14a946804f0c116a683548068960d2a19db3df2..0000000000000000000000000000000000000000
--- a/src/c_wrapper/device.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "clhelper.h"
-
-#ifndef __PYOPENCL_DEVICE_H
-#define __PYOPENCL_DEVICE_H
-
-// {{{ device
-
-extern template class clobj<cl_device_id>;
-extern template void print_arg<cl_device_id>(std::ostream&,
-                                             const cl_device_id&, bool);
-extern template void print_buf<cl_device_id>(std::ostream&, const cl_device_id*,
-                                             size_t, ArgType, bool, bool);
-
-class device : public clobj<cl_device_id> {
-public:
-    PYOPENCL_DEF_CL_CLASS(DEVICE);
-    enum reference_type_t {
-        REF_NOT_OWNABLE,
-        REF_CL_1_2,
-    };
-
-private:
-    reference_type_t m_ref_type;
-
-public:
-    static void get_version(cl_device_id dev, int *major, int *minor);
-    device(cl_device_id did, bool retain=false,
-           reference_type_t ref_type=REF_NOT_OWNABLE)
-        : clobj(did), m_ref_type(ref_type)
-    {
-        if (retain && ref_type != REF_NOT_OWNABLE) {
-            if (false) {
-            }
-#if PYOPENCL_CL_VERSION >= 0x1020
-            else if (ref_type == REF_CL_1_2) {
-                pyopencl_call_guarded(clRetainDevice, PYOPENCL_CL_CASTABLE_THIS);
-            }
-#endif
-
-            else {
-                throw clerror("Device", CL_INVALID_VALUE,
-                              "cannot own references to devices when device "
-                              "fission or CL 1.2 is not available");
-            }
-        }
-    }
-
-    ~device();
-
-    generic_info get_info(cl_uint param_name) const;
-#if PYOPENCL_CL_VERSION >= 0x1020
-    PYOPENCL_USE_RESULT pyopencl_buf<clobj_t>
-    create_sub_devices(const cl_device_partition_property *props);
-#endif
-};
-
-extern template void print_clobj<device>(std::ostream&, const device*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/error.h b/src/c_wrapper/error.h
deleted file mode 100644
index 30e985f93e7dfef3f5dafb702275e064e0b509aa..0000000000000000000000000000000000000000
--- a/src/c_wrapper/error.h
+++ /dev/null
@@ -1,337 +0,0 @@
-#include "wrap_cl.h"
-#include "pyhelper.h"
-#include "clobj.h"
-
-#include <string.h>
-#include <stdexcept>
-#include <iostream>
-#include <utility>
-#include <functional>
-#include <atomic>
-
-#ifndef __PYOPENCL_ERROR_H
-#define __PYOPENCL_ERROR_H
-
-// {{{ error
-
-// See https://github.com/inducer/pyopencl/pull/83
-#if GCC_VERSION > 50200
-#define PYOPENCL_CL_CASTABLE_THIS this
-#else
-#define PYOPENCL_CL_CASTABLE_THIS data()
-#endif
-
-// discouraged, assumes 'version linearity', use PYOPENCL_UNSUPPORTED_BEFORE
-#define PYOPENCL_UNSUPPORTED(ROUTINE, VERSION) \
-    auto err = (error*)malloc(sizeof(error)); \
-    err->routine = strdup(#ROUTINE); \
-    err->msg = strdup("unsupported in " VERSION); \
-    err->code = CL_INVALID_VALUE; \
-    err->other = 0; \
-    return err;
-
-#define PYOPENCL_UNSUPPORTED_BEFORE(ROUTINE, VERSION) \
-    auto err = (error*)malloc(sizeof(error)); \
-    err->routine = strdup(#ROUTINE); \
-    err->msg = strdup("unsupported before " VERSION); \
-    err->code = CL_INVALID_VALUE; \
-    err->other = 0; \
-    return err;
-
-class clerror : public std::runtime_error {
-private:
-    const char *m_routine;
-    cl_int m_code;
-
-public:
-    clerror(const char *rout, cl_int c, const char *msg="")
-        : std::runtime_error(msg), m_routine(rout), m_code(c)
-    {
-        if (DEBUG_ON) {
-            std::lock_guard<std::mutex> lock(dbg_lock);
-            std::cerr << rout << ";" << msg<< ";" << c << std::endl;
-        }
-    }
-    PYOPENCL_INLINE const char*
-    routine() const
-    {
-        return m_routine;
-    }
-
-    PYOPENCL_INLINE cl_int
-    code() const
-    {
-        return m_code;
-    }
-
-    PYOPENCL_INLINE bool
-    is_out_of_memory() const
-    {
-        // matches Python implementation in pyopencl/cffi_cl.py
-        return (code() == CL_MEM_OBJECT_ALLOCATION_FAILURE ||
-                code() == CL_OUT_OF_RESOURCES ||
-                code() == CL_OUT_OF_HOST_MEMORY);
-    }
-};
-
-// }}}
-
-// {{{ tracing and error reporting
-
-template<typename>
-struct __CLArgGetter {
-    template<typename T>
-    static PYOPENCL_INLINE auto
-    get(T&& clarg) -> decltype(clarg.convert())
-    {
-        return clarg.convert();
-    }
-};
-
-template<typename T, class = void>
-struct __CLFinish {
-    static PYOPENCL_INLINE void
-    call(T, bool)
-    {
-    }
-};
-
-template<typename T>
-struct __CLFinish<T, decltype((void)(std::declval<T>().finish(true)))> {
-    static PYOPENCL_INLINE void
-    call(T v, bool converted)
-    {
-        v.finish(converted);
-    }
-};
-
-template<typename T, class = void>
-struct __CLPost {
-    static PYOPENCL_INLINE void
-    call(T)
-    {
-    }
-};
-
-template<typename T>
-struct __CLPost<T, decltype((void)(std::declval<T>().post()))> {
-    static PYOPENCL_INLINE void
-    call(T v)
-    {
-        v.post();
-    }
-};
-
-template<typename T, class = void>
-struct is_out_arg : std::false_type {};
-
-template<typename T>
-struct is_out_arg<T, enable_if_t<rm_ref_t<T>::is_out> > : std::true_type {};
-
-template<typename T, class = void>
-struct __CLPrintOut {
-    static PYOPENCL_INLINE void
-    call(T, std::ostream&)
-    {
-    }
-};
-
-template<typename T>
-struct __CLPrintOut<T, enable_if_t<is_out_arg<T>::value> > {
-    static inline void
-    call(T v, std::ostream &stm)
-    {
-        stm << ", ";
-        v.print(stm, true);
-    }
-};
-
-template<typename T, class = void>
-struct __CLPrint {
-    static inline void
-    call(T v, std::ostream &stm, bool &&first)
-    {
-        if (!first) {
-            stm << ", ";
-        } else {
-            first = false;
-        }
-        if (is_out_arg<T>::value) {
-            stm << "{out}";
-        }
-        v.print(stm);
-    }
-};
-
-template<template<typename...> class Caller, size_t n, typename T>
-struct __CLCall {
-    template<typename... Ts>
-    static PYOPENCL_INLINE void
-    call(T &&t, Ts&&... ts)
-    {
-        __CLCall<Caller, n - 1, T>::call(std::forward<T>(t),
-                                         std::forward<Ts>(ts)...);
-        Caller<decltype(std::get<n>(t))>::call(std::get<n>(t),
-                                               std::forward<Ts>(ts)...);
-    }
-};
-
-template<template<typename...> class Caller, typename T>
-struct __CLCall<Caller, 0, T> {
-    template<typename... Ts>
-    static PYOPENCL_INLINE void
-    call(T &&t, Ts&&... ts)
-    {
-        Caller<decltype(std::get<0>(t))>::call(std::get<0>(t),
-                                               std::forward<Ts>(ts)...);
-    }
-};
-
-template<typename... Types>
-class CLArgPack : public ArgPack<CLArg, Types...> {
-    template<typename T> void
-    _print_trace(T &res, const char *name)
-    {
-        typename CLArgPack::tuple_base *that = this;
-        std::cerr << name << "(";
-        __CLCall<__CLPrint, sizeof...(Types) - 1,
-                 decltype(*that)>::call(*that, std::cerr, true);
-        std::cerr << ") = (ret: " << res;
-        __CLCall<__CLPrintOut, sizeof...(Types) - 1,
-                 decltype(*that)>::call(*that, std::cerr);
-        std::cerr << ")" << std::endl;
-    }
-public:
-    using ArgPack<CLArg, Types...>::ArgPack;
-    template<typename Func>
-    PYOPENCL_INLINE auto
-    clcall(Func func, const char *name)
-        -> decltype(this->template call<__CLArgGetter>(func))
-    {
-        auto res = this->template call<__CLArgGetter>(func);
-        if (DEBUG_ON) {
-            std::lock_guard<std::mutex> lock(dbg_lock);
-            _print_trace(res, name);
-        }
-        return res;
-    }
-    PYOPENCL_INLINE void
-    finish()
-    {
-        typename CLArgPack::tuple_base *that = this;
-        __CLCall<__CLFinish, sizeof...(Types) - 1,
-                 decltype(*that)>::call(*that, false);
-        __CLCall<__CLPost, sizeof...(Types) - 1,
-                 decltype(*that)>::call(*that);
-        __CLCall<__CLFinish, sizeof...(Types) - 1,
-                 decltype(*that)>::call(*that, true);
-    }
-};
-
-template<typename... Types>
-static PYOPENCL_INLINE CLArgPack<rm_ref_t<Types>...>
-make_clargpack(Types&&... args)
-{
-    return CLArgPack<rm_ref_t<Types>...>(std::forward<Types>(args)...);
-}
-
-template<typename... ArgTypes2, typename... ArgTypes>
-static PYOPENCL_INLINE void
-call_guarded(cl_int (CL_API_CALL *func)(ArgTypes...), const char *name, ArgTypes2&&... args)
-{
-    auto argpack = make_clargpack(std::forward<ArgTypes2>(args)...);
-    cl_int status_code = argpack.clcall(func, name);
-    if (status_code != CL_SUCCESS) {
-        throw clerror(name, status_code);
-    }
-    argpack.finish();
-}
-
-template<typename T, typename... ArgTypes, typename... ArgTypes2>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE T
-call_guarded(T (CL_API_CALL *func)(ArgTypes...), const char *name, ArgTypes2&&... args)
-{
-    cl_int status_code = CL_SUCCESS;
-    auto status_arg = buf_arg(status_code);
-    auto argpack = make_clargpack(std::forward<ArgTypes2>(args)..., status_arg);
-    T res = argpack.clcall(func, name);
-    if (status_code != CL_SUCCESS) {
-        throw clerror(name, status_code);
-    }
-    argpack.finish();
-    return res;
-}
-#define pyopencl_call_guarded(func, ...)    \
-    call_guarded(func, #func, __VA_ARGS__)
-
-static PYOPENCL_INLINE void
-cleanup_print_error(cl_int status_code, const char *name) noexcept
-{
-    std::cerr << ("PyOpenCL WARNING: a clean-up operation failed "
-                  "(dead context maybe?)") << std::endl
-              << name << " failed with code " << status_code << std::endl;
-}
-
-template<typename... ArgTypes, typename... ArgTypes2>
-static PYOPENCL_INLINE void
-call_guarded_cleanup(cl_int (CL_API_CALL *func)(ArgTypes...), const char *name,
-                     ArgTypes2&&... args)
-{
-    auto argpack = make_clargpack(std::forward<ArgTypes2>(args)...);
-    cl_int status_code = argpack.clcall(func, name);
-    if (status_code != CL_SUCCESS) {
-        cleanup_print_error(status_code, name);
-    } else {
-        argpack.finish();
-    }
-}
-#define pyopencl_call_guarded_cleanup(func, ...)    \
-    call_guarded_cleanup(func, #func, __VA_ARGS__)
-
-template<typename Func>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE error*
-c_handle_error(Func func) noexcept
-{
-    try {
-        func();
-        return nullptr;
-    } catch (const clerror &e) {
-        auto err = (error*)malloc(sizeof(error));
-        err->routine = strdup(e.routine());
-        err->msg = strdup(e.what());
-        err->code = e.code();
-        err->other = 0;
-        return err;
-    } catch (const std::exception &e) {
-        /* non-pyopencl exceptions need to be converted as well */
-        auto err = (error*)malloc(sizeof(error));
-        err->other = 1;
-        err->msg = strdup(e.what());
-        return err;
-    }
-}
-
-template<typename Func>
-static PYOPENCL_INLINE auto
-retry_mem_error(Func func) -> decltype(func())
-{
-    try {
-        return func();
-    } catch (clerror &e) {
-        if (PYOPENCL_LIKELY(!e.is_out_of_memory()) || !py::gc()) {
-            throw;
-        }
-    }
-    return func();
-}
-
-template<typename Func>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE error*
-c_handle_retry_mem_error(Func &&func) noexcept
-{
-    return c_handle_error([&] {retry_mem_error(std::forward<Func>(func));});
-}
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/event.cpp b/src/c_wrapper/event.cpp
deleted file mode 100644
index d75c3a324030f5bee50f2067f03e3c7dd40d6e9e..0000000000000000000000000000000000000000
--- a/src/c_wrapper/event.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-#include "event.h"
-#include "command_queue.h"
-#include "context.h"
-#include "pyhelper.h"
-
-#include <atomic>
-
-template class clobj<cl_event>;
-template void print_arg<cl_event>(std::ostream&, const cl_event&, bool);
-template void print_clobj<event>(std::ostream&, const event*);
-template void print_buf<cl_event>(std::ostream&, const cl_event*,
-                                  size_t, ArgType, bool, bool);
-
-class event_private {
-    mutable volatile std::atomic_bool m_finished;
-    virtual void finish() noexcept = 0;
-public:
-    event_private()
-        : m_finished(false)
-    {}
-    virtual
-    ~event_private()
-    {}
-    void
-    call_finish() noexcept
-    {
-        if (m_finished.exchange(true))
-            return;
-        finish();
-    }
-    bool
-    is_finished() noexcept
-    {
-        return m_finished;
-    }
-};
-
-event::event(cl_event event, bool retain, event_private *p)
-    : clobj(event), m_p(p)
-{
-    if (retain) {
-        try {
-            pyopencl_call_guarded(clRetainEvent, PYOPENCL_CL_CASTABLE_THIS);
-        } catch (...) {
-            m_p->call_finish();
-            delete m_p;
-            throw;
-        }
-    }
-}
-
-#if PYOPENCL_CL_VERSION >= 0x1010
-static PYOPENCL_INLINE bool
-release_private_use_cb(event *evt)
-{
-    try {
-        cl_int status = 0;
-        pyopencl_call_guarded(clGetEventInfo, evt,
-                              CL_EVENT_COMMAND_EXECUTION_STATUS,
-                              size_arg(status), nullptr);
-        // Event Callback may not be run immediately when the event
-        // is already completed.
-        if (status <= CL_COMPLETE)
-            return false;
-        cl_context ctx;
-        pyopencl_call_guarded(clGetEventInfo, evt, CL_EVENT_CONTEXT,
-                              size_arg(ctx), nullptr);
-        int major;
-        int minor;
-        context::get_version(ctx, &major, &minor);
-        return (major > 1) || (major >= 1 && minor >= 1);
-    } catch (const clerror &e) {
-        cleanup_print_error(e.code(), e.what());
-        return false;
-    }
-}
-#endif
-
-void
-event::release_private() noexcept
-{
-    if (!m_p)
-        return;
-    if (m_p->is_finished()) {
-        delete m_p;
-        return;
-    }
-#if PYOPENCL_CL_VERSION >= 0x1010 && defined(PYOPENCL_HAVE_EVENT_SET_CALLBACK)
-    if (release_private_use_cb(this)) {
-        try {
-            event_private *p = m_p;
-            set_callback(CL_COMPLETE, [p] (cl_int) {
-                    p->call_finish();
-                    delete p;
-                });
-            return;
-        } catch (const clerror &e) {
-            cleanup_print_error(e.code(), e.what());
-        }
-    }
-#endif
-    wait();
-    delete m_p;
-}
-
-event::~event()
-{
-    release_private();
-    pyopencl_call_guarded_cleanup(clReleaseEvent, PYOPENCL_CL_CASTABLE_THIS);
-}
-
-generic_info
-event::get_info(cl_uint param_name) const
-{
-    switch ((cl_event_info)param_name) {
-    case CL_EVENT_COMMAND_QUEUE:
-        return pyopencl_get_opaque_info(command_queue, Event, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_EVENT_COMMAND_TYPE:
-        return pyopencl_get_int_info(cl_command_type, Event,
-                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_EVENT_COMMAND_EXECUTION_STATUS:
-        return pyopencl_get_int_info(cl_int, Event, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_EVENT_REFERENCE_COUNT:
-        return pyopencl_get_int_info(cl_uint, Event, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    case CL_EVENT_CONTEXT:
-        return pyopencl_get_opaque_info(context, Event, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-
-    default:
-        throw clerror("Event.get_info", CL_INVALID_VALUE);
-    }
-}
-
-generic_info
-event::get_profiling_info(cl_profiling_info param) const
-{
-    switch (param) {
-    case CL_PROFILING_COMMAND_QUEUED:
-    case CL_PROFILING_COMMAND_SUBMIT:
-    case CL_PROFILING_COMMAND_START:
-    case CL_PROFILING_COMMAND_END:
-        return pyopencl_get_int_info(cl_ulong, EventProfiling, PYOPENCL_CL_CASTABLE_THIS, param);
-    default:
-        throw clerror("Event.get_profiling_info", CL_INVALID_VALUE);
-    }
-}
-
-void
-event::wait() const
-{
-    pyopencl_call_guarded(clWaitForEvents, len_arg(data()));
-    if (m_p) {
-        m_p->call_finish();
-    }
-}
-
-class nanny_event_private : public event_private {
-    void *m_ward;
-    void finish() noexcept
-    {
-        void *ward = m_ward;
-        m_ward = nullptr;
-        py::deref(ward);
-    }
-public:
-    nanny_event_private(void *ward)
-        : m_ward(nullptr)
-    {
-        m_ward = py::ref(ward);
-    }
-    PYOPENCL_USE_RESULT PYOPENCL_INLINE void*
-    get_ward() const noexcept
-    {
-        return m_ward;
-    }
-};
-
-nanny_event::nanny_event(cl_event evt, bool retain, void *ward)
-    : event(evt, retain, ward ? new nanny_event_private(ward) : nullptr)
-{
-}
-
-PYOPENCL_USE_RESULT void*
-nanny_event::get_ward() const noexcept
-{
-    return (get_p() ? static_cast<nanny_event_private*>(get_p())->get_ward() :
-            nullptr);
-}
-
-#if PYOPENCL_CL_VERSION >= 0x1010
-class user_event : public event {
-public:
-    using event::event;
-    PYOPENCL_INLINE void
-    set_status(cl_int status)
-    {
-        pyopencl_call_guarded(clSetUserEventStatus, PYOPENCL_CL_CASTABLE_THIS, status);
-    }
-};
-#endif
-
-// c wrapper
-
-// Event
-error*
-event__get_profiling_info(clobj_t _evt, cl_profiling_info param,
-                          generic_info *out)
-{
-    auto evt = static_cast<event*>(_evt);
-    return c_handle_error([&] {
-            *out = evt->get_profiling_info(param);
-        });
-}
-
-error*
-event__wait(clobj_t evt)
-{
-    return c_handle_error([&] {
-            static_cast<event*>(evt)->wait();
-        });
-}
-
-
-error*
-event__set_callback(clobj_t _evt, cl_int type, void *pyobj)
-{
-#if PYOPENCL_CL_VERSION >= 0x1010 && defined(PYOPENCL_HAVE_EVENT_SET_CALLBACK)
-    auto evt = static_cast<event*>(_evt);
-    return c_handle_error([&] {
-            pyobj = py::ref(pyobj);
-            try {
-                evt->set_callback(type, [=] (cl_int status) {
-                        py::call(pyobj, status);
-                        py::deref(pyobj);
-                    });
-            } catch (...) {
-                py::deref(pyobj);
-            }
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clSetEventCallback, "CL 1.0 and below and Windows")
-#endif
-}
-
-// Nanny Event
-void*
-nanny_event__get_ward(clobj_t evt)
-{
-    return static_cast<nanny_event*>(evt)->get_ward();
-}
-
-error*
-wait_for_events(const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clWaitForEvents, wait_for);
-        });
-}
-
-error*
-enqueue_wait_for_events(clobj_t _queue, const clobj_t *_wait_for,
-                        uint32_t num_wait_for)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clEnqueueWaitForEvents, queue, wait_for);
-        });
-}
-
-#if PYOPENCL_CL_VERSION >= 0x1010
-
-error*
-create_user_event(clobj_t *_evt, clobj_t _ctx)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            auto evt = pyopencl_call_guarded(clCreateUserEvent, ctx);
-            *_evt = pyopencl_convert_obj(user_event, clReleaseEvent, evt);
-        });
-}
-
-error*
-user_event__set_status(clobj_t _evt, cl_int status)
-{
-    auto evt = static_cast<user_event*>(_evt);
-    return c_handle_error([&] {
-            evt->set_status(status);
-        });
-}
-
-#endif
diff --git a/src/c_wrapper/event.h b/src/c_wrapper/event.h
deleted file mode 100644
index c6d0dd4b62e43d5b48149760ef4ef099360432b0..0000000000000000000000000000000000000000
--- a/src/c_wrapper/event.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#include "clhelper.h"
-#include <thread>
-
-#ifndef __PYOPENCL_EVENT_H
-#define __PYOPENCL_EVENT_H
-
-// {{{ event
-
-extern template class clobj<cl_event>;
-extern template void print_arg<cl_event>(std::ostream&, const cl_event&, bool);
-extern template void print_buf<cl_event>(std::ostream&, const cl_event*,
-                                         size_t, ArgType, bool, bool);
-
-class event_private;
-
-class event : public clobj<cl_event> {
-    event_private *m_p;
-    // return whether the event need to be released.
-    void release_private() noexcept;
-protected:
-    PYOPENCL_INLINE event_private*
-    get_p() const
-    {
-        return m_p;
-    }
-public:
-    PYOPENCL_DEF_CL_CLASS(EVENT);
-    event(cl_event event, bool retain, event_private *p=nullptr);
-    ~event();
-    generic_info get_info(cl_uint param) const;
-    PYOPENCL_USE_RESULT generic_info
-    get_profiling_info(cl_profiling_info param) const;
-    void wait() const;
-#if PYOPENCL_CL_VERSION >= 0x1010 && defined(PYOPENCL_HAVE_EVENT_SET_CALLBACK)
-    template<typename Func>
-    PYOPENCL_INLINE void
-    set_callback(cl_int type, Func &&_func)
-    {
-        auto func = new rm_ref_t<Func>(std::forward<Func>(_func));
-        try {
-            pyopencl_call_guarded(
-                clSetEventCallback, PYOPENCL_CL_CASTABLE_THIS, type,
-                static_cast<void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *)>(
-                    [] (cl_event, cl_int status, void *data) {
-                        rm_ref_t<Func> *func = static_cast<rm_ref_t<Func>*>(data);
-
-                        // We won't necessarily be able to acquire the GIL inside this
-                        // handler without deadlocking. Create a thread that *can*
-                        // wait.
-
-                        std::thread t([func, status] () {
-                                (*func)(status);
-                                delete func;
-                            });
-                        t.detach();
-
-                    }), (void*)func);
-        } catch (...) {
-            delete func;
-            throw;
-        }
-    }
-#endif
-};
-static PYOPENCL_INLINE auto
-event_out(clobj_t *ret) -> decltype(pyopencl_outarg(event, ret, clReleaseEvent))
-{
-    return pyopencl_outarg(event, ret, clReleaseEvent);
-}
-
-extern template void print_clobj<event>(std::ostream&, const event*);
-
-class nanny_event : public event {
-public:
-    nanny_event(cl_event evt, bool retain, void *ward=nullptr);
-    PYOPENCL_USE_RESULT void *get_ward() const noexcept;
-};
-static PYOPENCL_INLINE auto
-nanny_event_out(clobj_t *ret, void *ward)
-    -> decltype(pyopencl_outarg(nanny_event, ret, clReleaseEvent, ward))
-{
-    return pyopencl_outarg(nanny_event, ret, clReleaseEvent, ward);
-}
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/function.h b/src/c_wrapper/function.h
deleted file mode 100644
index 5d1a604c309e05d58b0e361971470d8db3c49571..0000000000000000000000000000000000000000
--- a/src/c_wrapper/function.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#include <functional>
-#include <utility>
-
-#ifndef __PYOPENCL_FUNCTION_H
-#define __PYOPENCL_FUNCTION_H
-
-#if defined __GNUC__ &&  __GNUC__ > 3
-#define PYOPENCL_INLINE inline __attribute__((__always_inline__))
-#else
-#define PYOPENCL_INLINE inline
-#endif
-
-template<typename T>
-using rm_ref_t = typename std::remove_reference<T>::type;
-template<typename T>
-using rm_const_t = typename std::remove_const<T>::type;
-template<bool B, class T = void>
-using enable_if_t = typename std::enable_if<B, T>::type;
-
-template<int...>
-struct seq {
-};
-
-template<int N, int... S>
-struct gens : gens<N - 1, N - 1, S...> {
-};
-
-template<int ...S>
-struct gens<0, S...> {
-    typedef seq<S...> type;
-};
-
-template<typename Function, int... S, typename... Arg2>
-static PYOPENCL_INLINE auto
-_call_func(Function func, seq<S...>, std::tuple<Arg2...> &args)
-    -> decltype(func(std::forward<Arg2>(std::get<S>(args))...))
-{
-    return func(static_cast<Arg2&&>(std::get<S>(args))...);
-}
-
-template<typename Function, typename T>
-static PYOPENCL_INLINE auto
-call_tuple(Function &&func, T &&args)
-    -> decltype(_call_func(std::forward<Function>(func),
-                           typename gens<std::tuple_size<T>::value>::type(),
-                           args))
-{
-    return _call_func(std::forward<Function>(func),
-                      typename gens<std::tuple_size<T>::value>::type(), args);
-}
-
-template<template<typename...> class Convert, typename... Types>
-using _ArgPackBase = std::tuple<Convert<typename std::remove_reference<Types>::type>...>;
-
-template<template<typename...> class Convert, typename... Types>
-class ArgPack : public _ArgPackBase<Convert, Types...> {
-public:
-    typedef _ArgPackBase<Convert, Types...> tuple_base;
-private:
-    template<typename T>
-    static PYOPENCL_INLINE std::tuple<T>
-    ensure_tuple(T &&v)
-    {
-        return std::tuple<T>(std::forward<T>(v));
-    }
-    template<typename... T>
-    static PYOPENCL_INLINE std::tuple<T...>
-    ensure_tuple(std::tuple<T...> &&t)
-    {
-        return t;
-    }
-
-    template<typename T>
-    using ArgConvert = Convert<rm_ref_t<T> >;
-    template<template<typename...> class Getter, int... S>
-    PYOPENCL_INLINE auto
-    __get(seq<S...>)
-#ifndef _MSC_VER
-    -> decltype(std::tuple_cat(
-                    ensure_tuple(Getter<ArgConvert<Types> >::get(
-                                     std::get<S>(*(tuple_base*)this)))...))
-#endif
-    {
-        return std::tuple_cat(
-            ensure_tuple(Getter<ArgConvert<Types> >::get(
-                             std::get<S>(*(tuple_base*)this)))...);
-    }
-public:
-    template<typename... Types2>
-    ArgPack(Types2&&... arg_orig)
-        : tuple_base(ArgConvert<rm_ref_t<Types> >(arg_orig)...)
-    {
-    }
-    ArgPack(ArgPack<Convert, Types...> &&other)
-        : tuple_base(static_cast<tuple_base&&>(other))
-    {
-    }
-    // GCC Bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57543
-    template<template<typename...> class Getter>
-    PYOPENCL_INLINE auto
-    get() -> decltype(this->__get<Getter>(
-                          typename gens<sizeof...(Types)>::type()))
-    {
-        return __get<Getter>(typename gens<sizeof...(Types)>::type());
-    }
-    template<template<typename...> class Getter, typename Func>
-    PYOPENCL_INLINE auto
-    call(Func func) -> decltype(call_tuple(func, this->get<Getter>()))
-    {
-        return call_tuple(func, this->get<Getter>());
-    }
-};
-
-template<template<typename...> class Convert, typename... Types>
-static PYOPENCL_INLINE ArgPack<Convert, rm_ref_t<Types>...>
-make_argpack(Types&&... args)
-{
-    return ArgPack<Convert, rm_ref_t<Types>...>(std::forward<Types>(args)...);
-}
-
-#endif
diff --git a/src/c_wrapper/gl_obj.cpp b/src/c_wrapper/gl_obj.cpp
deleted file mode 100644
index bd7edf31d8ce772adae21047ab34e3eb925f1482..0000000000000000000000000000000000000000
--- a/src/c_wrapper/gl_obj.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-#include "gl_obj.h"
-#include "context.h"
-#include "command_queue.h"
-#include "event.h"
-#include "clhelper.h"
-
-#ifdef HAVE_GL
-
-template void print_clobj<gl_buffer>(std::ostream&, const gl_buffer*);
-template void print_clobj<gl_renderbuffer>(std::ostream&,
-                                           const gl_renderbuffer*);
-
-generic_info
-gl_texture::get_gl_texture_info(cl_gl_texture_info param_name) const
-{
-    switch (param_name) {
-    case CL_GL_TEXTURE_TARGET:
-        return pyopencl_get_int_info(GLenum, GLTexture, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_GL_MIPMAP_LEVEL:
-        return pyopencl_get_int_info(GLint, GLTexture, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    default:
-        throw clerror("MemoryObject.get_gl_texture_info", CL_INVALID_VALUE);
-    }
-}
-
-typedef cl_int (CL_API_CALL *clEnqueueGLObjectFunc)(cl_command_queue, cl_uint,
-                                        const cl_mem*, cl_uint,
-                                        const cl_event*, cl_event*);
-
-static PYOPENCL_INLINE void
-enqueue_gl_objects(clEnqueueGLObjectFunc func, const char *name,
-                   clobj_t *evt, command_queue *cq, const clobj_t *mem_objects,
-                   uint32_t num_mem_objects, const clobj_t *wait_for,
-                   uint32_t num_wait_for)
-{
-    const auto _wait_for = buf_from_class<event>(wait_for, num_wait_for);
-    const auto _mem_objs = buf_from_class<memory_object>(
-        mem_objects, num_mem_objects);
-    call_guarded(func, name, cq, _mem_objs, _wait_for, event_out(evt));
-}
-#define enqueue_gl_objects(what, ...)                       \
-    enqueue_gl_objects(clEnqueue##what##GLObjects,              \
-                       "clEnqueue" #what "GLObjects", __VA_ARGS__)
-
-// c wrapper
-
-error*
-create_from_gl_texture(clobj_t *ptr, clobj_t _ctx, cl_mem_flags flags,
-                       GLenum texture_target, GLint miplevel,
-                       GLuint texture)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            cl_mem mem = pyopencl_call_guarded(clCreateFromGLTexture,
-                                               ctx, flags, texture_target, miplevel, texture);
-            *ptr = pyopencl_convert_obj(gl_texture, clReleaseMemObject, mem);
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clCreateFromGLTexture, "CL 1.1")
-#endif
-}
-
-error*
-create_from_gl_buffer(clobj_t *ptr, clobj_t _ctx,
-                      cl_mem_flags flags, GLuint bufobj)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            cl_mem mem = pyopencl_call_guarded(clCreateFromGLBuffer,
-                                               ctx, flags, bufobj);
-            *ptr = pyopencl_convert_obj(gl_buffer, clReleaseMemObject, mem);
-        });
-}
-
-error*
-create_from_gl_renderbuffer(clobj_t *ptr, clobj_t _ctx,
-                            cl_mem_flags flags, GLuint bufobj)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            cl_mem mem = pyopencl_call_guarded(clCreateFromGLRenderbuffer,
-                                               ctx, flags, bufobj);
-            *ptr = pyopencl_convert_obj(gl_renderbuffer,
-                                        clReleaseMemObject, mem);
-        });
-}
-
-error*
-enqueue_acquire_gl_objects(clobj_t *evt, clobj_t queue,
-                           const clobj_t *mem_objects,
-                           uint32_t num_mem_objects,
-                           const clobj_t *wait_for, uint32_t num_wait_for)
-{
-    return c_handle_error([&] {
-            enqueue_gl_objects(
-                Acquire, evt, static_cast<command_queue*>(queue),
-                mem_objects, num_mem_objects, wait_for, num_wait_for);
-        });
-}
-
-error*
-enqueue_release_gl_objects(clobj_t *evt, clobj_t queue,
-                           const clobj_t *mem_objects,
-                           uint32_t num_mem_objects,
-                           const clobj_t *wait_for, uint32_t num_wait_for)
-{
-    return c_handle_error([&] {
-            enqueue_gl_objects(
-                Release, evt, static_cast<command_queue*>(queue),
-                mem_objects, num_mem_objects, wait_for, num_wait_for);
-        });
-}
-
-error*
-get_gl_object_info(clobj_t mem, cl_gl_object_type *otype, GLuint *gl_name)
-{
-    auto globj = static_cast<memory_object*>(mem);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clGetGLObjectInfo, globj, buf_arg(*otype),
-                                  buf_arg(*gl_name));
-        });
-}
-
-#endif
-
-int
-have_gl()
-{
-#ifdef HAVE_GL
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-cl_context_properties
-get_apple_cgl_share_group()
-{
-#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
-    #ifdef HAVE_GL
-        CGLContextObj kCGLContext = CGLGetCurrentContext();
-        CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
-
-        return (cl_context_properties)kCGLShareGroup;
-    #else
-        throw clerror("get_apple_cgl_share_group unavailable: "
-            "GL interop not compiled",
-            CL_INVALID_VALUE);
-    #endif
-#else
-    throw clerror("get_apple_cgl_share_group unavailable: non-Apple platform",
-        CL_INVALID_VALUE);
-#endif /* __APPLE__ */
-}
diff --git a/src/c_wrapper/gl_obj.h b/src/c_wrapper/gl_obj.h
deleted file mode 100644
index 9f47e19b2dab3f93b35f0cd0c65f39471339d6f4..0000000000000000000000000000000000000000
--- a/src/c_wrapper/gl_obj.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "image.h"
-
-#ifndef __PYOPENCL_GL_OBJ_H
-#define __PYOPENCL_GL_OBJ_H
-
-#ifdef HAVE_GL
-
-// {{{ gl interop
-
-class gl_buffer : public memory_object {
-public:
-    PYOPENCL_DEF_CL_CLASS(GL_BUFFER);
-    PYOPENCL_INLINE
-    gl_buffer(cl_mem mem, bool retain)
-        : memory_object(mem, retain)
-    {}
-};
-
-class gl_renderbuffer : public memory_object {
-public:
-    PYOPENCL_DEF_CL_CLASS(GL_RENDERBUFFER);
-    PYOPENCL_INLINE
-    gl_renderbuffer(cl_mem mem, bool retain)
-        : memory_object(mem, retain)
-    {}
-};
-
-extern template void print_clobj<gl_buffer>(std::ostream&, const gl_buffer*);
-extern template void print_clobj<gl_renderbuffer>(std::ostream&,
-                                                  const gl_renderbuffer*);
-
-class gl_texture : public image {
-  public:
-    PYOPENCL_INLINE
-    gl_texture(cl_mem mem, bool retain)
-      : image(mem, retain)
-    {}
-    PYOPENCL_USE_RESULT generic_info
-    get_gl_texture_info(cl_gl_texture_info param_name) const;
-};
-
-// }}}
-
-#endif
-
-#endif
diff --git a/src/c_wrapper/image.cpp b/src/c_wrapper/image.cpp
deleted file mode 100644
index 6f571f3208d13f3fd80e22db604458f7a7ca2617..0000000000000000000000000000000000000000
--- a/src/c_wrapper/image.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-#include "image.h"
-#include "context.h"
-#include "command_queue.h"
-#include "event.h"
-#include "buffer.h"
-
-template void print_clobj<image>(std::ostream&, const image*);
-
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE image*
-new_image(cl_mem mem, const cl_image_format *fmt)
-{
-    return pyopencl_convert_obj(image, clReleaseMemObject, mem, fmt);
-}
-
-generic_info
-image::get_image_info(cl_image_info param) const
-{
-    switch (param) {
-    case CL_IMAGE_FORMAT:
-        return pyopencl_get_int_info(cl_image_format, Image, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_IMAGE_ELEMENT_SIZE:
-    case CL_IMAGE_ROW_PITCH:
-    case CL_IMAGE_SLICE_PITCH:
-    case CL_IMAGE_WIDTH:
-    case CL_IMAGE_HEIGHT:
-    case CL_IMAGE_DEPTH:
-#if PYOPENCL_CL_VERSION >= 0x1020
-    case CL_IMAGE_ARRAY_SIZE:
-#endif
-        return pyopencl_get_int_info(size_t, Image, PYOPENCL_CL_CASTABLE_THIS, param);
-
-#if PYOPENCL_CL_VERSION >= 0x1020
-        // TODO:
-        //    case CL_IMAGE_BUFFER:
-        //      {
-        //        cl_mem param_value;
-        //        PYOPENCL_CALL_GUARDED(clGetImageInfo, (this, param, sizeof(param_value), &param_value, 0));
-        //        if (param_value == 0)
-        //               {
-        //                 // no associated memory object? no problem.
-        //                 return py::object();
-        //               }
-        //        return create_mem_object_wrapper(param_value);
-        //      }
-    case CL_IMAGE_NUM_MIP_LEVELS:
-    case CL_IMAGE_NUM_SAMPLES:
-        return pyopencl_get_int_info(cl_uint, Image, PYOPENCL_CL_CASTABLE_THIS, param);
-#endif
-    default:
-        throw clerror("Image.get_image_info", CL_INVALID_VALUE);
-    }
-}
-
-// c wrapper
-
-// Image
-error*
-create_image_2d(clobj_t *img, clobj_t _ctx, cl_mem_flags flags,
-                cl_image_format *fmt, size_t width, size_t height,
-                size_t pitch, void *buf)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_retry_mem_error([&] {
-            auto mem = pyopencl_call_guarded(clCreateImage2D, ctx, flags, fmt,
-                                             width, height, pitch, buf);
-            *img = new_image(mem, fmt);
-        });
-}
-
-error*
-create_image_3d(clobj_t *img, clobj_t _ctx, cl_mem_flags flags,
-                cl_image_format *fmt, size_t width, size_t height,
-                size_t depth, size_t pitch_x, size_t pitch_y, void *buf)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_retry_mem_error([&] {
-            auto mem = pyopencl_call_guarded(clCreateImage3D, ctx, flags, fmt,
-                                             width, height, depth, pitch_x,
-                                             pitch_y, buf);
-            *img = new_image(mem, fmt);
-        });
-}
-
-
-error*
-create_image_from_desc(clobj_t *img, clobj_t _ctx, cl_mem_flags flags,
-                       cl_image_format *fmt, cl_image_desc *desc, void *buf)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            auto mem = pyopencl_call_guarded(clCreateImage, ctx, flags, fmt,
-                                             desc, buf);
-            *img = new_image(mem, fmt);
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clCreateImage, "CL 1.1 and below")
-#endif
-}
-
-
-error*
-image__get_image_info(clobj_t _img, cl_image_info param, generic_info *out)
-{
-    auto img = static_cast<image*>(_img);
-    return c_handle_error([&] {
-            *out = img->get_image_info(param);
-        });
-}
-
-type_t
-image__get_fill_type(clobj_t img)
-{
-    return static_cast<image*>(img)->get_fill_type();
-}
-
-error*
-enqueue_read_image(clobj_t *evt, clobj_t _queue, clobj_t _mem,
-                   const size_t *_orig, size_t orig_l,
-                   const size_t *_reg, size_t reg_l, void *buf,
-                   size_t row_pitch, size_t slice_pitch,
-                   const clobj_t *_wait_for, uint32_t num_wait_for,
-                   int block, void *pyobj)
-{
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    auto img = static_cast<image*>(_mem);
-    ConstBuffer<size_t, 3> orig(_orig, orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueReadImage, queue, img, bool(block),
-                                  orig, reg, row_pitch, slice_pitch, buf,
-                                  wait_for, nanny_event_out(evt, pyobj));
-        });
-}
-
-error*
-enqueue_copy_image(clobj_t *evt, clobj_t _queue, clobj_t _src, clobj_t _dst,
-                   const size_t *_src_orig, size_t src_orig_l,
-                   const size_t *_dst_orig, size_t dst_orig_l,
-                   const size_t *_reg, size_t reg_l,
-                   const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    auto src = static_cast<image*>(_src);
-    auto dst = static_cast<image*>(_dst);
-    ConstBuffer<size_t, 3> src_orig(_src_orig, src_orig_l);
-    ConstBuffer<size_t, 3> dst_orig(_dst_orig, dst_orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueCopyImage, queue, src, dst, src_orig,
-                                  dst_orig, reg, wait_for, event_out(evt));
-        });
-}
-
-error*
-enqueue_write_image(clobj_t *evt, clobj_t _queue, clobj_t _mem,
-                    const size_t *_orig, size_t orig_l,
-                    const size_t *_reg, size_t reg_l,
-                    const void *buf, size_t row_pitch, size_t slice_pitch,
-                    const clobj_t *_wait_for, uint32_t num_wait_for,
-                    int block, void *pyobj)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    auto img = static_cast<image*>(_mem);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    ConstBuffer<size_t, 3> orig(_orig, orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueWriteImage, queue, img, bool(block),
-                                  orig, reg, row_pitch, slice_pitch, buf,
-                                  wait_for, nanny_event_out(evt, pyobj));
-        });
-}
-
-error*
-enqueue_fill_image(clobj_t *evt, clobj_t _queue, clobj_t mem,
-                   const void *color, const size_t *_orig, size_t orig_l,
-                   const size_t *_reg, size_t reg_l,
-                   const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    // TODO debug color
-    auto queue = static_cast<command_queue*>(_queue);
-    auto img = static_cast<image*>(mem);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    ConstBuffer<size_t, 3> orig(_orig, orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueFillImage, queue, img, color, orig,
-                                  reg, wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clEnqueueFillImage, "CL 1.1 and below")
-#endif
-}
-
-// {{{ image transfers
-
-error*
-enqueue_copy_image_to_buffer(clobj_t *evt, clobj_t _queue, clobj_t _src,
-                             clobj_t _dst, const size_t *_orig, size_t orig_l,
-                             const size_t *_reg, size_t reg_l, size_t offset,
-                             const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    auto src = static_cast<image*>(_src);
-    auto dst = static_cast<buffer*>(_dst);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    ConstBuffer<size_t, 3> orig(_orig, orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueCopyImageToBuffer, queue, src, dst,
-                                  orig, reg, offset, wait_for, event_out(evt));
-        });
-}
-
-error*
-enqueue_copy_buffer_to_image(clobj_t *evt, clobj_t _queue, clobj_t _src,
-                             clobj_t _dst, size_t offset, const size_t *_orig,
-                             size_t orig_l, const size_t *_reg, size_t reg_l,
-                             const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    auto src = static_cast<buffer*>(_src);
-    auto dst = static_cast<image*>(_dst);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    ConstBuffer<size_t, 3> orig(_orig, orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueCopyBufferToImage, queue, src, dst,
-                                  offset, orig, reg, wait_for, event_out(evt));
-        });
-}
-
-// }}}
diff --git a/src/c_wrapper/image.h b/src/c_wrapper/image.h
deleted file mode 100644
index 7d29909c9d30896ae915cb32138561ab115f0950..0000000000000000000000000000000000000000
--- a/src/c_wrapper/image.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#include "memory_object.h"
-#include "clhelper.h"
-
-#ifndef __PYOPENCL_IMAGE_H
-#define __PYOPENCL_IMAGE_H
-
-// {{{ image
-
-class image : public memory_object {
-private:
-    cl_image_format m_format;
-public:
-    PYOPENCL_DEF_CL_CLASS(IMAGE);
-    PYOPENCL_INLINE
-    image(cl_mem mem, bool retain, const cl_image_format *fmt=0)
-        : memory_object(mem, retain), m_format(fmt ? *fmt : cl_image_format())
-    {}
-    PYOPENCL_INLINE const cl_image_format&
-    format()
-    {
-        if (!m_format.image_channel_data_type) {
-            pyopencl_call_guarded(clGetImageInfo, PYOPENCL_CL_CASTABLE_THIS, CL_IMAGE_FORMAT,
-                                  size_arg(m_format), nullptr);
-        }
-        return m_format;
-    }
-    PYOPENCL_USE_RESULT generic_info get_image_info(cl_image_info param) const;
-    PYOPENCL_INLINE type_t
-    get_fill_type()
-    {
-        switch (format().image_channel_data_type) {
-        case CL_SIGNED_INT8:
-        case CL_SIGNED_INT16:
-        case CL_SIGNED_INT32:
-            return TYPE_INT;
-        case CL_UNSIGNED_INT8:
-        case CL_UNSIGNED_INT16:
-        case CL_UNSIGNED_INT32:
-            return TYPE_UINT;
-        default:
-            return TYPE_FLOAT;
-        }
-    }
-};
-
-extern template void print_clobj<image>(std::ostream&, const image*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/kernel.cpp b/src/c_wrapper/kernel.cpp
deleted file mode 100644
index 817e10619727ae7d060c40d3096b637ec5923629..0000000000000000000000000000000000000000
--- a/src/c_wrapper/kernel.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-#include "kernel.h"
-#include "context.h"
-#include "device.h"
-#include "program.h"
-#include "memory_object.h"
-#include "sampler.h"
-#include "command_queue.h"
-#include "event.h"
-#include "clhelper.h"
-
-template class clobj<cl_kernel>;
-template void print_arg<cl_kernel>(std::ostream&, const cl_kernel&, bool);
-template void print_clobj<kernel>(std::ostream&, const kernel*);
-template void print_buf<cl_kernel>(std::ostream&, const cl_kernel*,
-                                   size_t, ArgType, bool, bool);
-
-kernel::~kernel()
-{
-    pyopencl_call_guarded_cleanup(clReleaseKernel, PYOPENCL_CL_CASTABLE_THIS);
-}
-
-generic_info
-kernel::get_info(cl_uint param) const
-{
-    switch ((cl_kernel_info)param) {
-    case CL_KERNEL_FUNCTION_NAME:
-        return pyopencl_get_str_info(Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_KERNEL_NUM_ARGS:
-    case CL_KERNEL_REFERENCE_COUNT:
-        return pyopencl_get_int_info(cl_uint, Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_KERNEL_CONTEXT:
-        return pyopencl_get_opaque_info(context, Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_KERNEL_PROGRAM:
-        return pyopencl_get_opaque_info(program, Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    case CL_KERNEL_ATTRIBUTES:
-        return pyopencl_get_str_info(Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
-#endif
-    default:
-        throw clerror("Kernel.get_info", CL_INVALID_VALUE);
-    }
-}
-
-generic_info
-kernel::get_work_group_info(cl_kernel_work_group_info param,
-                            const device *dev) const
-{
-    switch (param) {
-#if PYOPENCL_CL_VERSION >= 0x1010
-    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
-#endif
-    case CL_KERNEL_WORK_GROUP_SIZE:
-        return pyopencl_get_int_info(size_t, KernelWorkGroup, PYOPENCL_CL_CASTABLE_THIS, dev, param);
-    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
-        return pyopencl_get_array_info(size_t, KernelWorkGroup,
-                                       PYOPENCL_CL_CASTABLE_THIS, dev, param);
-    case CL_KERNEL_LOCAL_MEM_SIZE:
-#if PYOPENCL_CL_VERSION >= 0x1010
-    case CL_KERNEL_PRIVATE_MEM_SIZE:
-#endif
-        return pyopencl_get_int_info(cl_ulong, KernelWorkGroup,
-                                     PYOPENCL_CL_CASTABLE_THIS, dev, param);
-    default:
-        throw clerror("Kernel.get_work_group_info", CL_INVALID_VALUE);
-    }
-}
-
-#if PYOPENCL_CL_VERSION >= 0x1020
-PYOPENCL_USE_RESULT generic_info
-kernel::get_arg_info(cl_uint idx, cl_kernel_arg_info param) const
-{
-    switch (param) {
-    case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
-        return pyopencl_get_int_info(cl_kernel_arg_address_qualifier,
-                                     KernelArg, PYOPENCL_CL_CASTABLE_THIS, idx, param);
-    case CL_KERNEL_ARG_ACCESS_QUALIFIER:
-        return pyopencl_get_int_info(cl_kernel_arg_access_qualifier,
-                                     KernelArg, PYOPENCL_CL_CASTABLE_THIS, idx, param);
-    case CL_KERNEL_ARG_TYPE_QUALIFIER:
-        return pyopencl_get_int_info(cl_kernel_arg_type_qualifier,
-                                     KernelArg, PYOPENCL_CL_CASTABLE_THIS, idx, param);
-    case CL_KERNEL_ARG_TYPE_NAME:
-    case CL_KERNEL_ARG_NAME:
-        return pyopencl_get_str_info(KernelArg, PYOPENCL_CL_CASTABLE_THIS, idx, param);
-    default:
-        throw clerror("Kernel.get_arg_info", CL_INVALID_VALUE);
-    }
-}
-#endif
-
-// c wrapper
-
-// Kernel
-error*
-create_kernel(clobj_t *knl, clobj_t _prog, const char *name)
-{
-    auto prog = static_cast<const program*>(_prog);
-    return c_handle_error([&] {
-            *knl = new kernel(pyopencl_call_guarded(clCreateKernel, prog,
-                                                    name), false);
-        });
-}
-
-error*
-kernel__set_arg_null(clobj_t _knl, cl_uint arg_index)
-{
-    auto knl = static_cast<kernel*>(_knl);
-    return c_handle_error([&] {
-            const cl_mem m = 0;
-            pyopencl_call_guarded(clSetKernelArg, knl, arg_index, size_arg(m));
-        });
-}
-
-error*
-kernel__set_arg_mem(clobj_t _knl, cl_uint arg_index, clobj_t _mem)
-{
-    auto knl = static_cast<kernel*>(_knl);
-    auto mem = static_cast<memory_object*>(_mem);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clSetKernelArg, knl, arg_index,
-                                  size_arg(mem->data()));
-        });
-}
-
-error*
-kernel__set_arg_sampler(clobj_t _knl, cl_uint arg_index, clobj_t _samp)
-{
-    auto knl = static_cast<kernel*>(_knl);
-    auto samp = static_cast<sampler*>(_samp);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clSetKernelArg, knl, arg_index,
-                                  size_arg(samp->data()));
-        });
-}
-
-error*
-kernel__set_arg_buf(clobj_t _knl, cl_uint arg_index,
-                    const void *buffer, size_t size)
-{
-    auto knl = static_cast<kernel*>(_knl);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clSetKernelArg, knl, arg_index,
-                                  size_arg(buffer, size));
-        });
-}
-
-error*
-kernel__set_arg_svm_pointer(clobj_t _knl, cl_uint arg_index, void *value)
-{
-#if PYOPENCL_CL_VERSION >= 0x2000
-    auto knl = static_cast<kernel*>(_knl);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clSetKernelArgSVMPointer, knl, arg_index, value);
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clSetKernelArgSVMPointer, "CL 2.0")
-#endif
-}
-
-error*
-kernel__get_work_group_info(clobj_t _knl, cl_kernel_work_group_info param,
-                            clobj_t _dev, generic_info *out)
-{
-    auto knl = static_cast<kernel*>(_knl);
-    auto dev = static_cast<device*>(_dev);
-    return c_handle_error([&] {
-            *out = knl->get_work_group_info(param, dev);
-        });
-}
-
-error*
-kernel__get_arg_info(clobj_t _knl, cl_uint idx, cl_kernel_arg_info param,
-                     generic_info *out)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    auto knl = static_cast<kernel*>(_knl);
-    return c_handle_error([&] {
-            *out = knl->get_arg_info(idx, param);
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clKernelGetArgInfo, "CL 1.1 and below")
-#endif
-}
-
-error*
-enqueue_nd_range_kernel(clobj_t *evt, clobj_t _queue, clobj_t _knl,
-                        cl_uint work_dim, const size_t *global_work_offset,
-                        const size_t *global_work_size,
-                        const size_t *local_work_size,
-                        const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    auto knl = static_cast<kernel*>(_knl);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueNDRangeKernel, queue, knl, work_dim,
-                                  global_work_offset, global_work_size,
-                                  local_work_size, wait_for, event_out(evt));
-        });
-}
-
-error*
-enqueue_task(clobj_t *evt, clobj_t _queue, clobj_t _knl,
-             const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    auto knl = static_cast<kernel*>(_knl);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueTask, queue, knl, wait_for,
-                                  event_out(evt));
-        });
-}
diff --git a/src/c_wrapper/kernel.h b/src/c_wrapper/kernel.h
deleted file mode 100644
index 5db1a0cc53bcd8171cb6973182d5e82de4ddc479..0000000000000000000000000000000000000000
--- a/src/c_wrapper/kernel.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#include "error.h"
-
-#ifndef __PYOPENCL_KERNEL_H
-#define __PYOPENCL_KERNEL_H
-
-class device;
-
-// {{{ kernel
-
-extern template class clobj<cl_kernel>;
-extern template void print_arg<cl_kernel>(std::ostream&,
-                                          const cl_kernel&, bool);
-extern template void print_buf<cl_kernel>(std::ostream&, const cl_kernel*,
-                                          size_t, ArgType, bool, bool);
-
-class kernel : public clobj<cl_kernel> {
-public:
-    PYOPENCL_DEF_CL_CLASS(KERNEL);
-    PYOPENCL_INLINE
-    kernel(cl_kernel knl, bool retain)
-        : clobj(knl)
-    {
-        if (retain) {
-            pyopencl_call_guarded(clRetainKernel, PYOPENCL_CL_CASTABLE_THIS);
-        }
-    }
-    ~kernel();
-    generic_info get_info(cl_uint param) const;
-
-    PYOPENCL_USE_RESULT generic_info
-    get_work_group_info(cl_kernel_work_group_info param,
-                        const device *dev) const;
-
-#if PYOPENCL_CL_VERSION >= 0x1020
-    PYOPENCL_USE_RESULT generic_info
-    get_arg_info(cl_uint idx, cl_kernel_arg_info param) const;
-#endif
-};
-
-extern template void print_clobj<kernel>(std::ostream&, const kernel*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/memory_map.cpp b/src/c_wrapper/memory_map.cpp
deleted file mode 100644
index 068274df6d6d4c306682ab372b560817d454037a..0000000000000000000000000000000000000000
--- a/src/c_wrapper/memory_map.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-#include "memory_map.h"
-#include "image.h"
-#include "buffer.h"
-#include "event.h"
-#include "clhelper.h"
-
-template class clobj<void*>;
-template void print_arg<void*>(std::ostream&, void *const&, bool);
-template void print_buf<void*>(std::ostream&, void *const*,
-                               size_t, ArgType, bool, bool);
-
-memory_map::~memory_map()
-{
-    if (!m_valid.exchange(false))
-        return;
-    pyopencl_call_guarded_cleanup(clEnqueueUnmapMemObject, m_queue,
-                                  m_mem, PYOPENCL_CL_CASTABLE_THIS, 0, nullptr, nullptr);
-}
-
-void
-memory_map::release(clobj_t *evt, const command_queue *queue,
-                    const clobj_t *_wait_for, uint32_t num_wait_for) const
-{
-    if (!m_valid.exchange(false)) {
-        throw clerror("MemoryMap.release", CL_INVALID_VALUE,
-                      "trying to double-unref mem map");
-    }
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    queue = queue ? queue : &m_queue;
-    pyopencl_call_guarded(clEnqueueUnmapMemObject, queue,
-                          m_mem, PYOPENCL_CL_CASTABLE_THIS, wait_for, event_out(evt));
-}
-
-generic_info
-memory_map::get_info(cl_uint) const
-{
-    throw clerror("MemoryMap.get_info", CL_INVALID_VALUE);
-}
-
-intptr_t
-memory_map::intptr() const
-{
-    return m_valid ? (intptr_t)data() : 0;
-}
-
-memory_map*
-convert_memory_map(clobj_t evt, command_queue *queue,
-                   memory_object *buf, void *res)
-{
-    try {
-        return new memory_map(queue, buf, res);
-    } catch (...) {
-        delete evt;
-        pyopencl_call_guarded_cleanup(clEnqueueUnmapMemObject, queue,
-                                      buf, res, 0, nullptr, nullptr);
-        throw;
-    }
-}
-
-// c wrapper
-
-// Memory Map
-error*
-memory_map__release(clobj_t _map, clobj_t _queue, const clobj_t *_wait_for,
-                    uint32_t num_wait_for, clobj_t *evt)
-{
-    auto map = static_cast<memory_map*>(_map);
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_error([&] {
-            map->release(evt, queue, _wait_for, num_wait_for);
-        });
-}
-
-void*
-memory_map__data(clobj_t _map)
-{
-    return static_cast<memory_map*>(_map)->data();
-}
-
-error*
-enqueue_map_image(clobj_t *evt, clobj_t *map, clobj_t _queue, clobj_t _mem,
-                  cl_map_flags flags, const size_t *_orig, size_t orig_l,
-                  const size_t *_reg, size_t reg_l, size_t *row_pitch,
-                  size_t *slice_pitch, const clobj_t *_wait_for,
-                  uint32_t num_wait_for, int block)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    auto img = static_cast<image*>(_mem);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    ConstBuffer<size_t, 3> orig(_orig, orig_l);
-    ConstBuffer<size_t, 3> reg(_reg, reg_l, 1);
-    return c_handle_retry_mem_error([&] {
-            void *res = pyopencl_call_guarded(
-                clEnqueueMapImage, queue, img, bool(block), flags, orig,
-                reg, row_pitch, slice_pitch, wait_for, event_out(evt));
-            *map = convert_memory_map(*evt, queue, img, res);
-        });
-}
-
-error*
-enqueue_map_buffer(clobj_t *evt, clobj_t *map, clobj_t _queue, clobj_t _mem,
-                   cl_map_flags flags, size_t offset, size_t size,
-                   const clobj_t *_wait_for, uint32_t num_wait_for,
-                   int block)
-{
-    auto queue = static_cast<command_queue*>(_queue);
-    auto buf = static_cast<buffer*>(_mem);
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    return c_handle_retry_mem_error([&] {
-            void *res = pyopencl_call_guarded(
-                clEnqueueMapBuffer, queue, buf, bool(block),
-                flags, offset, size, wait_for, event_out(evt));
-            *map = convert_memory_map(*evt, queue, buf, res);
-        });
-}
diff --git a/src/c_wrapper/memory_map.h b/src/c_wrapper/memory_map.h
deleted file mode 100644
index 65a988a9b0a462bcc9e145eade6d59363eb98279..0000000000000000000000000000000000000000
--- a/src/c_wrapper/memory_map.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#include "error.h"
-#include "command_queue.h"
-#include "memory_object.h"
-
-#ifndef __PYOPENCL_MEMORY_MAP_H
-#define __PYOPENCL_MEMORY_MAP_H
-
-class event;
-
-// {{{ memory_map
-
-extern template class clobj<void*>;
-extern template void print_arg<void*>(std::ostream&, void *const&, bool);
-extern template void print_buf<void*>(std::ostream&, void *const*,
-                                      size_t, ArgType, bool, bool);
-
-class memory_map : public clobj<void*> {
-private:
-    mutable volatile std::atomic_bool m_valid;
-    command_queue m_queue;
-    memory_object m_mem;
-public:
-    constexpr static const char *class_name = "MEMORY_MAP";
-    PYOPENCL_INLINE
-    memory_map(const command_queue *queue, const memory_object *mem, void *ptr)
-        : clobj(ptr), m_valid(true), m_queue(*queue), m_mem(*mem)
-    {}
-    ~memory_map();
-    void release(clobj_t *evt, const command_queue *queue,
-                 const clobj_t *wait_for, uint32_t num_wait_for) const;
-    generic_info get_info(cl_uint) const;
-    intptr_t intptr() const;
-};
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/memory_object.cpp b/src/c_wrapper/memory_object.cpp
deleted file mode 100644
index 6f1ba321307931e6194860b9db292aa2a2fa372a..0000000000000000000000000000000000000000
--- a/src/c_wrapper/memory_object.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-#include "memory_object.h"
-#include "context.h"
-#include "event.h"
-#include "command_queue.h"
-#include "clhelper.h"
-
-template class clobj<cl_mem>;
-template void print_arg<cl_mem>(std::ostream&, const cl_mem&, bool);
-template void print_buf<cl_mem>(std::ostream&, const cl_mem*,
-                                size_t, ArgType, bool, bool);
-
-generic_info
-memory_object::get_info(cl_uint param_name) const
-{
-    switch ((cl_mem_info)param_name) {
-    case CL_MEM_TYPE:
-        return pyopencl_get_int_info(cl_mem_object_type, MemObject,
-                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_MEM_FLAGS:
-        return pyopencl_get_int_info(cl_mem_flags, MemObject,
-                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_MEM_SIZE:
-        return pyopencl_get_int_info(size_t, MemObject, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_MEM_HOST_PTR:
-        throw clerror("MemoryObject.get_info", CL_INVALID_VALUE,
-                      "Use MemoryObject.get_host_array to get "
-                      "host pointer.");
-    case CL_MEM_MAP_COUNT:
-    case CL_MEM_REFERENCE_COUNT:
-        return pyopencl_get_int_info(cl_uint, MemObject,
-                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_MEM_CONTEXT:
-        return pyopencl_get_opaque_info(context, MemObject, PYOPENCL_CL_CASTABLE_THIS, param_name);
-
-#if PYOPENCL_CL_VERSION >= 0x1010
-        // TODO
-        //       case CL_MEM_ASSOCIATED_MEMOBJECT:
-        //      {
-        //        cl_mem param_value;
-        //        PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (this, param_name, sizeof(param_value), &param_value, 0));
-        //        if (param_value == 0)
-        //          {
-        //            // no associated memory object? no problem.
-        //            return py::object();
-        //          }
-
-        //        return create_mem_object_wrapper(param_value);
-        //      }
-    case CL_MEM_OFFSET:
-        return pyopencl_get_int_info(size_t, MemObject, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    case CL_MEM_USES_SVM_POINTER:
-        return pyopencl_get_int_info(cl_bool, MemObject, PYOPENCL_CL_CASTABLE_THIS, param_name);
-#endif
-
-    default:
-        throw clerror("MemoryObject.get_info", CL_INVALID_VALUE);
-    }
-}
-
-memory_object::~memory_object()
-{
-    if (!m_valid.exchange(false))
-        return;
-    pyopencl_call_guarded_cleanup(clReleaseMemObject, PYOPENCL_CL_CASTABLE_THIS);
-}
-
-// c wrapper
-
-// Memory Object
-error*
-memory_object__release(clobj_t obj)
-{
-    return c_handle_error([&] {
-            static_cast<memory_object*>(obj)->release();
-        });
-}
-
-error*
-memory_object__get_host_array(clobj_t _obj, void **hostptr, size_t *size)
-{
-    auto obj = static_cast<memory_object*>(_obj);
-    return c_handle_error([&] {
-            cl_mem_flags flags;
-            pyopencl_call_guarded(clGetMemObjectInfo, obj, CL_MEM_FLAGS,
-                                  size_arg(flags), nullptr);
-            if (!(flags & CL_MEM_USE_HOST_PTR))
-                throw clerror("MemoryObject.get_host_array", CL_INVALID_VALUE,
-                              "Only MemoryObject with USE_HOST_PTR "
-                              "is supported.");
-            pyopencl_call_guarded(clGetMemObjectInfo, obj, CL_MEM_HOST_PTR,
-                                  size_arg(*hostptr), nullptr);
-            pyopencl_call_guarded(clGetMemObjectInfo, obj, CL_MEM_SIZE,
-                                  size_arg(*size), nullptr);
-        });
-}
-
-error*
-enqueue_migrate_mem_objects(clobj_t *evt, clobj_t _queue,
-                            const clobj_t *_mem_obj, uint32_t num_mem_obj,
-                            cl_mem_migration_flags flags,
-                            const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    const auto mem_obj = buf_from_class<memory_object>(_mem_obj, num_mem_obj);
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_retry_mem_error([&] {
-            pyopencl_call_guarded(clEnqueueMigrateMemObjects, queue,
-                                  mem_obj, flags, wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueMigrateMemObjects, "CL 1.2")
-#endif
-}
diff --git a/src/c_wrapper/memory_object.h b/src/c_wrapper/memory_object.h
deleted file mode 100644
index 635dc470ef1966d672799e7099d8452d52a56551..0000000000000000000000000000000000000000
--- a/src/c_wrapper/memory_object.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#include "error.h"
-#include <atomic>
-
-#ifndef __PYOPENCL_MEMORY_OBJECT_H
-#define __PYOPENCL_MEMORY_OBJECT_H
-
-// {{{ memory_object
-
-extern template class clobj<cl_mem>;
-extern template void print_arg<cl_mem>(std::ostream&, const cl_mem&, bool);
-extern template void print_buf<cl_mem>(std::ostream&, const cl_mem*,
-                                       size_t, ArgType, bool, bool);
-
-class memory_object : public clobj<cl_mem> {
-private:
-    mutable volatile std::atomic_bool m_valid;
-public:
-    constexpr static const char *class_name = "MEMORY_OBJECT";
-    PYOPENCL_INLINE
-    memory_object(cl_mem mem, bool retain)
-        : clobj(mem), m_valid(true)
-    {
-        if (retain) {
-            pyopencl_call_guarded(clRetainMemObject, PYOPENCL_CL_CASTABLE_THIS);
-        }
-    }
-    PYOPENCL_INLINE
-    memory_object(const memory_object &mem)
-        : memory_object(mem.data(), true)
-    {}
-    ~memory_object();
-    generic_info get_info(cl_uint param_name) const;
-    void
-    release() const
-    {
-        if (PYOPENCL_UNLIKELY(!m_valid.exchange(false))) {
-            throw clerror("MemoryObject.release", CL_INVALID_VALUE,
-                          "trying to double-unref mem object");
-        }
-        pyopencl_call_guarded(clReleaseMemObject, PYOPENCL_CL_CASTABLE_THIS);
-    }
-#if 0
-    PYOPENCL_USE_RESULT size_t
-    size() const
-    {
-        size_t param_value;
-        pyopencl_call_guarded(clGetMemObjectInfo, this, CL_MEM_SIZE,
-                              size_arg(param_value), nullptr);
-        return param_value;
-    }
-#endif
-};
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/mingw-std-threads b/src/c_wrapper/mingw-std-threads
deleted file mode 160000
index 776ce7faf9368ec9588ee77458799c281cb25737..0000000000000000000000000000000000000000
--- a/src/c_wrapper/mingw-std-threads
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 776ce7faf9368ec9588ee77458799c281cb25737
diff --git a/src/c_wrapper/platform.cpp b/src/c_wrapper/platform.cpp
deleted file mode 100644
index 21a896b207b56f0d155f6e730651912a9ea04226..0000000000000000000000000000000000000000
--- a/src/c_wrapper/platform.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-#include "platform.h"
-#include "device.h"
-#include "clhelper.h"
-
-#include <stdlib.h>
-
-template class clobj<cl_platform_id>;
-template void print_arg<cl_platform_id>(std::ostream&,
-                                        const cl_platform_id&, bool);
-template void print_clobj<platform>(std::ostream&, const platform*);
-template void print_buf<cl_platform_id>(std::ostream&, const cl_platform_id*,
-                                        size_t, ArgType, bool, bool);
-
-generic_info
-platform::get_info(cl_uint param_name) const
-{
-    switch ((cl_platform_info)param_name) {
-    case CL_PLATFORM_PROFILE:
-    case CL_PLATFORM_VERSION:
-    case CL_PLATFORM_NAME:
-    case CL_PLATFORM_VENDOR:
-#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
-    case CL_PLATFORM_EXTENSIONS:
-#endif
-        return pyopencl_get_str_info(Platform, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    default:
-        throw clerror("Platform.get_info", CL_INVALID_VALUE);
-    }
-}
-
-void
-platform::get_version(cl_platform_id plat, int *major, int *minor)
-{
-    char s_buff[128];
-    size_t size;
-    pyopencl_buf<char> d_buff(0);
-    char *name = s_buff;
-    pyopencl_call_guarded(clGetPlatformInfo, plat, CL_PLATFORM_VERSION,
-                          0, nullptr, buf_arg(size));
-    if (PYOPENCL_UNLIKELY(size > sizeof(s_buff))) {
-        d_buff.resize(size);
-        name = d_buff.get();
-    }
-    pyopencl_call_guarded(clGetPlatformInfo, plat, CL_PLATFORM_VERSION,
-                          size_arg(name, size), buf_arg(size));
-    *major = *minor = -1;
-    sscanf(name, "OpenCL %d.%d", major, minor);
-    // Well, hopefully there won't be a negative OpenCL version =)
-    if (*major < 0 || *minor < 0) {
-        throw clerror("Platform.get_version", CL_INVALID_VALUE,
-                      "platform returned non-conformant "
-                      "platform version string");
-    }
-}
-
-// c wrapper
-
-error*
-get_platforms(clobj_t **_platforms, uint32_t *num_platforms)
-{
-    return c_handle_error([&] {
-            *num_platforms = 0;
-            pyopencl_call_guarded(clGetPlatformIDs, 0, nullptr,
-                                  buf_arg(*num_platforms));
-            pyopencl_buf<cl_platform_id> platforms(*num_platforms);
-            pyopencl_call_guarded(clGetPlatformIDs, platforms,
-                                  buf_arg(*num_platforms));
-            *_platforms = buf_to_base<platform>(platforms).release();
-        });
-}
-
-error*
-platform__get_devices(clobj_t _plat, clobj_t **_devices,
-                      uint32_t *num_devices, cl_device_type devtype)
-{
-    auto plat = static_cast<platform*>(_plat);
-    return c_handle_error([&] {
-            *num_devices = 0;
-            try {
-                pyopencl_call_guarded(clGetDeviceIDs, plat, devtype, 0, nullptr,
-                                      buf_arg(*num_devices));
-            } catch (const clerror &e) {
-                if (e.code() != CL_DEVICE_NOT_FOUND)
-                    throw e;
-                *num_devices = 0;
-            }
-            if (*num_devices == 0) {
-                *_devices = nullptr;
-                return;
-            }
-            pyopencl_buf<cl_device_id> devices(*num_devices);
-            pyopencl_call_guarded(clGetDeviceIDs, plat, devtype, devices,
-                                  buf_arg(*num_devices));
-            *_devices = buf_to_base<device>(devices).release();
-        });
-}
-
-error*
-platform__unload_compiler(clobj_t plat)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clUnloadPlatformCompiler,
-                                  static_cast<platform*>(plat));
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clUnloadPlatformCompiler, "CL 1.1 and below")
-#endif
-}
diff --git a/src/c_wrapper/platform.h b/src/c_wrapper/platform.h
deleted file mode 100644
index 1bad5c298aecb8c55b536276d78f3d8e63d400ea..0000000000000000000000000000000000000000
--- a/src/c_wrapper/platform.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "error.h"
-
-#ifndef __PYOPENCL_PLATFORM_H
-#define __PYOPENCL_PLATFORM_H
-
-// {{{ platform
-
-extern template class clobj<cl_platform_id>;
-extern template void print_arg<cl_platform_id>(std::ostream&,
-                                               const cl_platform_id&, bool);
-extern template void print_buf<cl_platform_id>(
-    std::ostream&, const cl_platform_id*, size_t, ArgType, bool, bool);
-
-class platform : public clobj<cl_platform_id> {
-public:
-    static void get_version(cl_platform_id plat, int *major, int *minor);
-    using clobj::clobj;
-    PYOPENCL_DEF_CL_CLASS(PLATFORM);
-
-    generic_info get_info(cl_uint param_name) const;
-};
-
-extern template void print_clobj<platform>(std::ostream&, const platform*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/program.cpp b/src/c_wrapper/program.cpp
deleted file mode 100644
index a0535c06a9d33abdf4cb91d93a87d6141bd7407a..0000000000000000000000000000000000000000
--- a/src/c_wrapper/program.cpp
+++ /dev/null
@@ -1,269 +0,0 @@
-#include "program.h"
-#include "device.h"
-#include "context.h"
-#include "clhelper.h"
-#include "kernel.h"
-
-template class clobj<cl_program>;
-template void print_arg<cl_program>(std::ostream&, const cl_program&, bool);
-template void print_clobj<program>(std::ostream&, const program*);
-template void print_buf<cl_program>(std::ostream&, const cl_program*,
-                                    size_t, ArgType, bool, bool);
-
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE program*
-new_program(cl_program prog, program_kind_type progkind=KND_UNKNOWN)
-{
-    return pyopencl_convert_obj(program, clReleaseProgram, prog, progkind);
-}
-
-program::~program()
-{
-    pyopencl_call_guarded_cleanup(clReleaseProgram, PYOPENCL_CL_CASTABLE_THIS);
-}
-
-generic_info
-program::get_info(cl_uint param) const
-{
-    switch ((cl_program_info)param) {
-    case CL_PROGRAM_CONTEXT:
-        return pyopencl_get_opaque_info(context, Program, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_PROGRAM_REFERENCE_COUNT:
-    case CL_PROGRAM_NUM_DEVICES:
-        return pyopencl_get_int_info(cl_uint, Program, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_PROGRAM_DEVICES:
-        return pyopencl_get_opaque_array_info(device, Program, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_PROGRAM_SOURCE:
-        return pyopencl_get_str_info(Program, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_PROGRAM_BINARY_SIZES:
-        return pyopencl_get_array_info(size_t, Program, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_PROGRAM_BINARIES: {
-        auto sizes = pyopencl_get_vec_info(size_t, Program, PYOPENCL_CL_CASTABLE_THIS,
-                                           CL_PROGRAM_BINARY_SIZES);
-        pyopencl_buf<char*> result_ptrs(sizes.len());
-        for (size_t i  = 0;i < sizes.len();i++) {
-            result_ptrs[i] = (char*)malloc(sizes[i]);
-        }
-        try {
-            pyopencl_call_guarded(clGetProgramInfo, PYOPENCL_CL_CASTABLE_THIS, CL_PROGRAM_BINARIES,
-                                  sizes.len() * sizeof(char*),
-                                  result_ptrs.get(), nullptr);
-        } catch (...) {
-            for (size_t i  = 0;i < sizes.len();i++) {
-                free(result_ptrs[i]);
-            }
-        }
-        pyopencl_buf<generic_info> gis(sizes.len());
-        for (size_t i  = 0;i < sizes.len();i++) {
-            gis[i] = make_generic_info(
-                CLASS_NONE,
-                _copy_str(std::string("char[") + tostring(sizes[i]) + "]"),
-                true,
-                result_ptrs[i],
-                true);
-        }
-        return pyopencl_convert_array_info(generic_info, gis);
-    }
-
-#if PYOPENCL_CL_VERSION >= 0x1020
-    case CL_PROGRAM_NUM_KERNELS:
-        return pyopencl_get_int_info(size_t, Program, PYOPENCL_CL_CASTABLE_THIS, param);
-    case CL_PROGRAM_KERNEL_NAMES:
-        return pyopencl_get_str_info(Program, PYOPENCL_CL_CASTABLE_THIS, param);
-#endif
-    default:
-        throw clerror("Program.get_info", CL_INVALID_VALUE);
-    }
-}
-
-generic_info
-program::get_build_info(const device *dev, cl_program_build_info param) const
-{
-    switch (param) {
-    case CL_PROGRAM_BUILD_STATUS:
-        return pyopencl_get_int_info(cl_build_status, ProgramBuild,
-                                     PYOPENCL_CL_CASTABLE_THIS, dev, param);
-    case CL_PROGRAM_BUILD_OPTIONS:
-    case CL_PROGRAM_BUILD_LOG:
-        return pyopencl_get_str_info(ProgramBuild, PYOPENCL_CL_CASTABLE_THIS, dev, param);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    case CL_PROGRAM_BINARY_TYPE:
-        return pyopencl_get_int_info(cl_program_binary_type, ProgramBuild,
-                                     PYOPENCL_CL_CASTABLE_THIS, dev, param);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE:
-        return pyopencl_get_int_info(size_t, ProgramBuild,
-                                     PYOPENCL_CL_CASTABLE_THIS, dev, param);
-#endif
-    default:
-        throw clerror("Program.get_build_info", CL_INVALID_VALUE);
-    }
-}
-
-#if PYOPENCL_CL_VERSION >= 0x1020
-void
-program::compile(const char *opts, const clobj_t *_devs, size_t num_devs,
-                 const clobj_t *_prgs, const char *const *names,
-                 size_t num_hdrs)
-{
-    const auto devs = buf_from_class<device>(_devs, num_devs);
-    const auto prgs = buf_from_class<program>(_prgs, num_hdrs);
-    pyopencl_call_guarded(clCompileProgram, PYOPENCL_CL_CASTABLE_THIS, devs, opts, prgs,
-                          buf_arg(names, num_hdrs), nullptr, nullptr);
-}
-#endif
-
-pyopencl_buf<clobj_t>
-program::all_kernels()
-{
-    cl_uint num_knls;
-    pyopencl_call_guarded(clCreateKernelsInProgram, PYOPENCL_CL_CASTABLE_THIS, 0, nullptr,
-                          buf_arg(num_knls));
-    pyopencl_buf<cl_kernel> knls(num_knls);
-    pyopencl_call_guarded(clCreateKernelsInProgram, PYOPENCL_CL_CASTABLE_THIS, knls,
-                          buf_arg(num_knls));
-    return buf_to_base<kernel>(knls, true);
-}
-
-// c wrapper
-
-// Program
-error*
-create_program_with_source(clobj_t *prog, clobj_t _ctx, const char *_src)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            const auto &src = _src;
-            const size_t length = strlen(src);
-            cl_program result = pyopencl_call_guarded(
-                clCreateProgramWithSource, ctx, len_arg(src), buf_arg(length));
-            *prog = new_program(result, KND_SOURCE);
-        });
-}
-
-error*
-create_program_with_il(clobj_t *prog, clobj_t _ctx, void *il, size_t length)
-{
-#if PYOPENCL_CL_VERSION >= 0x2010
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            cl_program result = pyopencl_call_guarded(
-                clCreateProgramWithIL, ctx, il, length);
-            *prog = new_program(result, KND_SOURCE);
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clCreateProgramWithIL, "CL 2.1")
-#endif
-}
-
-error*
-create_program_with_binary(clobj_t *prog, clobj_t _ctx,
-                           cl_uint num_devices, const clobj_t *devices,
-                           const unsigned char **binaries, size_t *binary_sizes)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    const auto devs = buf_from_class<device>(devices, num_devices);
-    pyopencl_buf<cl_int> binary_statuses(num_devices);
-    return c_handle_error([&] {
-            cl_program result = pyopencl_call_guarded(
-                clCreateProgramWithBinary, ctx, devs,
-                binary_sizes, binaries, binary_statuses.get());
-            // for (cl_uint i = 0; i < num_devices; ++i)
-            //   std::cout << i << ":" << binary_statuses[i] << std::endl;
-            *prog = new_program(result, KND_BINARY);
-        });
-}
-
-error*
-program__build(clobj_t _prog, const char *options,
-               cl_uint num_devices, const clobj_t *_devices)
-{
-    auto prog = static_cast<const program*>(_prog);
-    const auto devices = buf_from_class<device>(_devices, num_devices);
-    return c_handle_error([&] {
-            pyopencl_call_guarded(clBuildProgram, prog, devices, options,
-                                  nullptr, nullptr);
-        });
-}
-
-error*
-program__kind(clobj_t prog, int *kind)
-{
-    return c_handle_error([&] {
-            *kind = static_cast<program*>(prog)->kind();
-        });
-}
-
-error*
-program__get_build_info(clobj_t _prog, clobj_t _dev,
-                        cl_program_build_info param, generic_info *out)
-{
-    auto prog = static_cast<program*>(_prog);
-    auto dev = static_cast<device*>(_dev);
-    return c_handle_error([&] {
-            *out = prog->get_build_info(dev, param);
-        });
-}
-
-error*
-program__create_with_builtin_kernels(clobj_t *_prg, clobj_t _ctx,
-                                     const clobj_t *_devs, uint32_t num_devs,
-                                     const char *names)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    const auto devs = buf_from_class<device>(_devs, num_devs);
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            auto prg = pyopencl_call_guarded(clCreateProgramWithBuiltInKernels,
-                                             ctx, devs, names);
-            *_prg = new_program(prg);
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clCreateProgramWithBuiltInKernels, "CL 1.1 and below")
-#endif
-}
-
-error*
-program__compile(clobj_t _prg, const char *opts, const clobj_t *_devs,
-                 size_t num_devs, const clobj_t *_prgs,
-                 const char *const *names, size_t num_hdrs)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    auto prg = static_cast<program*>(_prg);
-    return c_handle_error([&] {
-            prg->compile(opts, _devs, num_devs, _prgs, names, num_hdrs);
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clCompileProgram, "CL 1.1 and below")
-#endif
-}
-
-error*
-program__link(clobj_t *_prg, clobj_t _ctx, const clobj_t *_prgs,
-              size_t num_prgs, const char *opts, const clobj_t *_devs,
-              size_t num_devs)
-{
-#if PYOPENCL_CL_VERSION >= 0x1020
-    const auto devs = buf_from_class<device>(_devs, num_devs);
-    const auto prgs = buf_from_class<program>(_prgs, num_prgs);
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            auto prg = pyopencl_call_guarded(clLinkProgram, ctx, devs, opts,
-                                             prgs, nullptr, nullptr);
-            *_prg = new_program(prg);
-        });
-#else
-    PYOPENCL_UNSUPPORTED(clLinkProgram, "CL 1.1 and below")
-#endif
-}
-
-error*
-program__all_kernels(clobj_t _prg, clobj_t **_knl, uint32_t *size)
-{
-    auto prg = static_cast<program*>(_prg);
-    return c_handle_error([&] {
-            auto knls = prg->all_kernels();
-            *size = knls.len();
-            *_knl = knls.release();
-        });
-}
diff --git a/src/c_wrapper/program.h b/src/c_wrapper/program.h
deleted file mode 100644
index 63d2fc760141bec68a8a1347e5300cb07ccda41b..0000000000000000000000000000000000000000
--- a/src/c_wrapper/program.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#include "clhelper.h"
-
-#ifndef __PYOPENCL_PROGRAM_H
-#define __PYOPENCL_PROGRAM_H
-
-class device;
-
-// {{{ program
-
-extern template class clobj<cl_program>;
-extern template void print_arg<cl_program>(std::ostream&,
-                                           const cl_program&, bool);
-extern template void print_buf<cl_program>(std::ostream&, const cl_program*,
-                                           size_t, ArgType, bool, bool);
-
-class program : public clobj<cl_program> {
-private:
-    program_kind_type m_program_kind;
-
-public:
-    PYOPENCL_DEF_CL_CLASS(PROGRAM);
-    PYOPENCL_INLINE
-    program(cl_program prog, bool retain,
-            program_kind_type progkind=KND_UNKNOWN)
-        : clobj(prog), m_program_kind(progkind)
-    {
-        if (retain) {
-            pyopencl_call_guarded(clRetainProgram, PYOPENCL_CL_CASTABLE_THIS);
-        }
-    }
-    ~program();
-    PYOPENCL_USE_RESULT PYOPENCL_INLINE program_kind_type
-    kind() const
-    {
-        return m_program_kind;
-    }
-    PYOPENCL_USE_RESULT pyopencl_buf<cl_device_id>
-    get_info__devices() const
-    {
-        return pyopencl_get_vec_info(cl_device_id, Program, PYOPENCL_CL_CASTABLE_THIS,
-                                     CL_PROGRAM_DEVICES);
-    }
-    generic_info get_info(cl_uint param_name) const;
-    PYOPENCL_USE_RESULT generic_info
-    get_build_info(const device *dev, cl_program_build_info param_name) const;
-#if PYOPENCL_CL_VERSION >= 0x1020
-    void compile(const char *opts, const clobj_t *_devs, size_t num_devs,
-                 const clobj_t *_prgs, const char *const *names,
-                 size_t num_hdrs);
-#endif
-    pyopencl_buf<clobj_t> all_kernels();
-};
-
-extern template void print_clobj<program>(std::ostream&, const program*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/pyhelper.cpp b/src/c_wrapper/pyhelper.cpp
deleted file mode 100644
index 7397d12b7ddc1238801ded0a269a768da8f37e5a..0000000000000000000000000000000000000000
--- a/src/c_wrapper/pyhelper.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "pyhelper.h"
-
-namespace py {
-WrapFunc<int()> gc;
-WrapFunc<void*(void*)> ref;
-WrapFunc<void(void*)> deref;
-WrapFunc<void(void*, cl_int)> call;
-}
-
-void
-set_py_funcs(int (*_gc)(), void *(*_ref)(void*), void (*_deref)(void*),
-             void (*_call)(void*, cl_int))
-{
-    py::gc = _gc;
-    py::ref = _ref;
-    py::deref = _deref;
-    py::call = _call;
-}
diff --git a/src/c_wrapper/pyhelper.h b/src/c_wrapper/pyhelper.h
deleted file mode 100644
index 50c08402908aa92107c9ce0f09bad1e4002f8d35..0000000000000000000000000000000000000000
--- a/src/c_wrapper/pyhelper.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef __PYOPENCL_PYHELPER_H
-#define __PYOPENCL_PYHELPER_H
-
-#include "wrap_cl.h"
-#include "function.h"
-
-template<typename _Signature>
-class WrapFunc;
-
-template<typename Ret, typename... Args>
-class WrapFunc<Ret(Args...)> {
-    typedef Ret (*_FuncType)(Args...);
-    _FuncType m_func;
-    static PYOPENCL_INLINE _FuncType
-    check_func(_FuncType f)
-    {
-        return f ? f : ([] (Args...) {return Ret();});
-    }
-public:
-    WrapFunc(_FuncType func=nullptr)
-        : m_func(check_func(func))
-    {}
-    Ret
-    operator()(Args... args)
-    {
-        return m_func(std::forward<Args>(args)...);
-    }
-    WrapFunc&
-    operator=(_FuncType func)
-    {
-        m_func = check_func(func);
-        return *this;
-    }
-};
-
-namespace py {
-extern WrapFunc<int()> gc;
-extern WrapFunc<void*(void*)> ref;
-extern WrapFunc<void(void*)> deref;
-extern WrapFunc<void(void*, cl_int)> call;
-}
-
-#endif
diff --git a/src/c_wrapper/sampler.cpp b/src/c_wrapper/sampler.cpp
deleted file mode 100644
index b373c7830f9b1398a7d904b3c153db6b7ad2ab6a..0000000000000000000000000000000000000000
--- a/src/c_wrapper/sampler.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "sampler.h"
-#include "context.h"
-#include "clhelper.h"
-
-template class clobj<cl_sampler>;
-template void print_arg<cl_sampler>(std::ostream&, const cl_sampler&, bool);
-template void print_clobj<sampler>(std::ostream&, const sampler*);
-template void print_buf<cl_sampler>(std::ostream&, const cl_sampler*,
-                                    size_t, ArgType, bool, bool);
-
-sampler::~sampler()
-{
-    pyopencl_call_guarded_cleanup(clReleaseSampler, PYOPENCL_CL_CASTABLE_THIS);
-}
-
-generic_info
-sampler::get_info(cl_uint param_name) const
-{
-    switch ((cl_sampler_info)param_name) {
-    case CL_SAMPLER_REFERENCE_COUNT:
-        return pyopencl_get_int_info(cl_uint, Sampler, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_SAMPLER_CONTEXT:
-        return pyopencl_get_opaque_info(context, Sampler, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_SAMPLER_ADDRESSING_MODE:
-        return pyopencl_get_int_info(cl_addressing_mode, Sampler,
-                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_SAMPLER_FILTER_MODE:
-        return pyopencl_get_int_info(cl_filter_mode, Sampler, PYOPENCL_CL_CASTABLE_THIS, param_name);
-    case CL_SAMPLER_NORMALIZED_COORDS:
-        return pyopencl_get_int_info(cl_bool, Sampler, PYOPENCL_CL_CASTABLE_THIS, param_name);
-
-#if PYOPENCL_CL_VERSION >= 0x2000
-    // TODO: MIP_FILTER_MODE, LOD_MIN, LOD_MAX
-#endif
-
-    default:
-        throw clerror("Sampler.get_info", CL_INVALID_VALUE);
-    }
-}
-
-// c wrapper
-
-// Sampler
-error*
-create_sampler(clobj_t *samp, clobj_t _ctx, int norm_coords,
-               cl_addressing_mode am, cl_filter_mode fm)
-{
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_error([&] {
-            *samp = new sampler(pyopencl_call_guarded(clCreateSampler, ctx,
-                                                      norm_coords, am, fm),
-                                false);
-        });
-}
diff --git a/src/c_wrapper/sampler.h b/src/c_wrapper/sampler.h
deleted file mode 100644
index 404b82e57e552393dd2e2b005e54b491584ccd3e..0000000000000000000000000000000000000000
--- a/src/c_wrapper/sampler.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#include "error.h"
-
-#ifndef __PYOPENCL_SAMPLER_H
-#define __PYOPENCL_SAMPLER_H
-
-// {{{ sampler
-
-extern template class clobj<cl_sampler>;
-extern template void print_arg<cl_sampler>(std::ostream&,
-                                           const cl_sampler&, bool);
-extern template void print_buf<cl_sampler>(std::ostream&, const cl_sampler*,
-                                           size_t, ArgType, bool, bool);
-
-class sampler : public clobj<cl_sampler> {
-public:
-    PYOPENCL_DEF_CL_CLASS(SAMPLER);
-    PYOPENCL_INLINE
-    sampler(cl_sampler samp, bool retain)
-        : clobj(samp)
-    {
-        if (retain) {
-            pyopencl_call_guarded(clRetainSampler, PYOPENCL_CL_CASTABLE_THIS);
-        }
-    }
-    ~sampler();
-    generic_info get_info(cl_uint param_name) const;
-};
-
-extern template void print_clobj<sampler>(std::ostream&, const sampler*);
-
-// }}}
-
-#endif
diff --git a/src/c_wrapper/svm.cpp b/src/c_wrapper/svm.cpp
deleted file mode 100644
index 8452ec99953e3806b0b890220f20c5c46d71a875..0000000000000000000000000000000000000000
--- a/src/c_wrapper/svm.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-#include "context.h"
-#include "command_queue.h"
-#include "event.h"
-
-error*
-svm_alloc(
-    clobj_t _ctx, cl_mem_flags flags, size_t size, cl_uint alignment,
-    void **result)
-{
-#if PYOPENCL_CL_VERSION >= 0x2000
-    auto ctx = static_cast<context*>(_ctx);
-    return c_handle_retry_mem_error([&] {
-            *result = clSVMAlloc(ctx->data(), flags, size, alignment);
-            if (!*result)
-                throw clerror("clSVMalloc", CL_INVALID_VALUE,
-                    "(allocation failure, unspecified reason)");
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clSVMAlloc, "CL 2.0")
-#endif
-}
-
-
-error*
-svm_free(clobj_t _ctx, void *svm_pointer)
-{
-#if PYOPENCL_CL_VERSION >= 0x2000
-    auto ctx = static_cast<context*>(_ctx);
-    // no error returns (?!)
-    clSVMFree(ctx->data(), svm_pointer);
-    return nullptr;
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clSVMFree, "CL 2.0")
-#endif
-}
-
-
-error*
-enqueue_svm_free(
-    clobj_t *evt, clobj_t _queue,
-    cl_uint num_svm_pointers,
-    void *svm_pointers[],
-    const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x2000
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_retry_mem_error([&] {
-        pyopencl_call_guarded(
-            clEnqueueSVMFree, queue,
-            num_svm_pointers, svm_pointers,
-            /* pfn_free_func*/ nullptr,
-            /* user_data */ nullptr,
-            wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueSVMFree, "CL 2.0")
-#endif
-}
-
-
-error*
-enqueue_svm_memcpy(
-    clobj_t *evt, clobj_t _queue,
-    cl_bool is_blocking,
-    void *dst_ptr, const void *src_ptr, size_t size,
-    const clobj_t *_wait_for, uint32_t num_wait_for, void *pyobj)
-{
-#if PYOPENCL_CL_VERSION >= 0x2000
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_retry_mem_error([&] {
-        pyopencl_call_guarded(
-            clEnqueueSVMMemcpy, queue,
-            is_blocking,
-            dst_ptr, src_ptr, size,
-            wait_for, nanny_event_out(evt, pyobj));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueSVMMemcpy, "CL 2.0")
-#endif
-}
-
-
-error*
-enqueue_svm_memfill(
-    clobj_t *evt, clobj_t _queue,
-    void *svm_ptr,
-    const void *pattern, size_t pattern_size, size_t size,
-    const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x2000
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_retry_mem_error([&] {
-        pyopencl_call_guarded(
-            clEnqueueSVMMemFill, queue,
-            svm_ptr,
-            pattern, pattern_size, size,
-            wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueSVMMemFill, "CL 2.0")
-#endif
-}
-
-
-error*
-enqueue_svm_map(
-    clobj_t *evt, clobj_t _queue,
-    cl_bool blocking_map, cl_map_flags map_flags,
-    void *svm_ptr, size_t size,
-    const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x2000
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_retry_mem_error([&] {
-        pyopencl_call_guarded(
-            clEnqueueSVMMap, queue,
-            blocking_map, map_flags,
-            svm_ptr, size,
-            wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueSVMMap, "CL 2.0")
-#endif
-}
-
-
-error*
-enqueue_svm_unmap(
-    clobj_t *evt, clobj_t _queue,
-    void *svm_ptr,
-    const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x2000
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_retry_mem_error([&] {
-        pyopencl_call_guarded(
-            clEnqueueSVMUnmap, queue,
-            svm_ptr,
-            wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueSVMUnmap, "CL 2.0")
-#endif
-}
-
-
-error*
-enqueue_svm_migrate_mem(
-    clobj_t *evt, clobj_t _queue,
-    cl_uint num_svm_pointers,
-    const void **svm_pointers,
-    const size_t *sizes,
-    cl_mem_migration_flags flags,
-    const clobj_t *_wait_for, uint32_t num_wait_for)
-{
-#if PYOPENCL_CL_VERSION >= 0x2010
-    const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for);
-    auto queue = static_cast<command_queue*>(_queue);
-    return c_handle_retry_mem_error([&] {
-        pyopencl_call_guarded(
-            clEnqueueSVMMigrateMem, queue,
-            num_svm_pointers, svm_pointers, sizes, flags,
-            wait_for, event_out(evt));
-        });
-#else
-    PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueSVMMigrateMem, "CL 2.1")
-#endif
-}
diff --git a/src/c_wrapper/svm.h b/src/c_wrapper/svm.h
deleted file mode 100644
index c0e39ec47390d543e0ed8e943edaf10522842d33..0000000000000000000000000000000000000000
--- a/src/c_wrapper/svm.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __PYOPENCL_SVM_H
-#define __PYOPENCL_SVM_H
-
-#endif
diff --git a/src/c_wrapper/utils.h b/src/c_wrapper/utils.h
deleted file mode 100644
index d1bbb7d06f0779537bb61c953e2aae07e1e0e412..0000000000000000000000000000000000000000
--- a/src/c_wrapper/utils.h
+++ /dev/null
@@ -1,551 +0,0 @@
-#include "wrap_cl.h"
-#include "function.h"
-#include "debug.h"
-
-#include <string>
-#include <sstream>
-#include <string.h>
-#include <memory>
-
-#ifndef __PYOPENCL_UTILS_H
-#define __PYOPENCL_UTILS_H
-
-#if (defined(__GNUC__) && (__GNUC__ > 2))
-#  define PYOPENCL_EXPECT(exp, var) __builtin_expect(exp, var)
-#else
-#  define PYOPENCL_EXPECT(exp, var) (exp)
-#endif
-
-#define PYOPENCL_LIKELY(x) PYOPENCL_EXPECT(bool(x), true)
-#define PYOPENCL_UNLIKELY(x) PYOPENCL_EXPECT(bool(x), false)
-
-template<class T>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE std::string
-tostring(const T& v)
-{
-    std::ostringstream ostr;
-    ostr << v;
-    return ostr.str();
-}
-
-template<typename T, class = void>
-struct CLGenericArgPrinter {
-    static PYOPENCL_INLINE void
-    print(std::ostream &stm, T &arg)
-    {
-        stm << arg;
-    }
-};
-
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE void*
-cl_memdup(const void *p, size_t size)
-{
-    void *res = malloc(size);
-    memcpy(res, p, size);
-    return res;
-}
-
-template<typename T>
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE T*
-cl_memdup(const T *p)
-{
-    // Not supported by libstdc++ yet...
-    // static_assert(std::is_trivially_copy_constructible<T>::value);
-    return static_cast<T*>(cl_memdup(static_cast<const void*>(p), sizeof(T)));
-}
-
-enum class ArgType {
-    None,
-    SizeOf,
-    Length,
-};
-
-template<typename T, class = void>
-struct type_size : std::integral_constant<size_t, sizeof(T)> {};
-template<typename T>
-struct type_size<T, enable_if_t<std::is_same<rm_const_t<T>, void>::value> > :
-        std::integral_constant<size_t, 1> {};
-
-template<typename T>
-static PYOPENCL_INLINE void
-_print_buf_content(std::ostream &stm, const T *p, size_t len)
-{
-    if (len > 1) {
-        stm << "[";
-    }
-    for (size_t i = 0;i < len;i++) {
-        CLGenericArgPrinter<const T>::print(stm, p[i]);
-        if (i != len - 1) {
-            stm << ", ";
-        }
-    }
-    if (len > 1) {
-        stm << "]";
-    }
-}
-
-template<>
-PYOPENCL_INLINE void
-_print_buf_content<char>(std::ostream &stm, const char *p, size_t len)
-{
-    dbg_print_str(stm, p, len);
-}
-
-template<>
-PYOPENCL_INLINE void
-_print_buf_content<unsigned char>(std::ostream &stm,
-                                  const unsigned char *p, size_t len)
-{
-    dbg_print_bytes(stm, p, len);
-}
-
-template<>
-PYOPENCL_INLINE void
-_print_buf_content<void>(std::ostream &stm, const void *p, size_t len)
-{
-    dbg_print_bytes(stm, static_cast<const unsigned char*>(p), len);
-}
-
-template<typename T>
-void
-print_buf(std::ostream &stm, const T *p, size_t len,
-          ArgType arg_type, bool content, bool out)
-{
-    const size_t ele_size = type_size<T>::value;
-    if (out) {
-        stm << "*(" << (const void*)p << "): ";
-        if (p) {
-            _print_buf_content(stm, p, len);
-        } else {
-            stm << "NULL";
-        }
-    } else {
-        bool need_quote = content || arg_type != ArgType::None;
-        if (content) {
-            if (p) {
-                _print_buf_content(stm, p, len);
-                stm << " ";
-            } else {
-                stm << "NULL ";
-            }
-        }
-        if (need_quote) {
-            stm << "<";
-        }
-        switch (arg_type) {
-        case ArgType::SizeOf:
-            stm << ele_size * len << ", ";
-            break;
-        case ArgType::Length:
-            stm << len << ", ";
-            break;
-        default:
-            break;
-        }
-        stm << (const void*)p;
-        if (need_quote) {
-            stm << ">";
-        }
-    }
-}
-
-template<typename T>
-void
-print_arg(std::ostream &stm, const T &v, bool out)
-{
-    if (!out) {
-        stm << (const void*)&v;
-    } else {
-        stm << "*(" << (const void*)&v << "): " << v;
-    }
-}
-extern template void print_buf<char>(std::ostream&, const char*, size_t,
-                                     ArgType, bool, bool);
-extern template void print_buf<cl_int>(std::ostream&, const cl_int*, size_t,
-                                       ArgType, bool, bool);
-extern template void print_buf<cl_uint>(std::ostream&, const cl_uint*, size_t,
-                                        ArgType, bool, bool);
-extern template void print_buf<cl_long>(std::ostream&, const cl_long*, size_t,
-                                        ArgType, bool, bool);
-extern template void print_buf<cl_ulong>(std::ostream&, const cl_ulong*, size_t,
-                                         ArgType, bool, bool);
-extern template void print_buf<cl_image_format>(std::ostream&,
-                                                const cl_image_format*, size_t,
-                                                ArgType, bool, bool);
-
-template<>
-struct CLGenericArgPrinter<std::nullptr_t, void> {
-    static PYOPENCL_INLINE void
-    print(std::ostream &stm, std::nullptr_t&)
-    {
-        stm << (void*)nullptr;
-    }
-};
-
-template<typename T>
-struct CLGenericArgPrinter<
-    T, enable_if_t<std::is_same<const char*, rm_const_t<T> >::value ||
-                   std::is_same<char*, rm_const_t<T> >::value> > {
-    static PYOPENCL_INLINE void
-    print(std::ostream &stm, const char *str)
-    {
-        dbg_print_str(stm, str);
-    }
-};
-
-template<typename T, class = void>
-class CLArg {
-private:
-    T &m_arg;
-public:
-    CLArg(T &arg) noexcept
-        : m_arg(arg)
-    {}
-    CLArg(CLArg &&other) noexcept
-        : m_arg(other.m_arg)
-    {}
-    PYOPENCL_INLINE T&
-    convert() noexcept
-    {
-        return m_arg;
-    }
-    PYOPENCL_INLINE void
-    print(std::ostream &stm)
-    {
-        CLGenericArgPrinter<T>::print(stm, m_arg);
-    }
-};
-
-template<>
-class CLArg<bool> : public CLArg<cl_bool> {
-    cl_bool m_arg;
-public:
-    CLArg(bool arg) noexcept
-        : CLArg<cl_bool>(m_arg), m_arg(arg ? CL_TRUE : CL_FALSE)
-    {}
-    CLArg(CLArg<bool> &&other) noexcept
-        : CLArg<bool>(bool(other.m_arg))
-    {}
-    PYOPENCL_INLINE void
-    print(std::ostream &stm)
-    {
-        stm << (m_arg ? "true" : "false");
-    }
-};
-
-template<typename T, ArgType AT=ArgType::None>
-class ArgBuffer {
-private:
-    T *m_buf;
-    size_t m_len;
-protected:
-    PYOPENCL_INLINE void
-    set(T *buf) noexcept
-    {
-        m_buf = buf;
-    }
-public:
-    typedef T type;
-    constexpr static ArgType arg_type = AT;
-    ArgBuffer(T *buf, size_t l) noexcept
-        : m_buf(buf), m_len(l)
-    {}
-    ArgBuffer(ArgBuffer<T, AT> &&other) noexcept
-        : ArgBuffer(other.m_buf, other.m_len)
-    {}
-    PYOPENCL_INLINE rm_const_t<T>*
-    get() const noexcept
-    {
-        return const_cast<rm_const_t<T>*>(m_buf);
-    }
-    template<typename T2 = T>
-    PYOPENCL_INLINE T2&
-    operator[](int i) const
-    {
-        return m_buf[i];
-    }
-    PYOPENCL_INLINE size_t
-    len() const noexcept
-    {
-        return m_len;
-    }
-};
-
-template<ArgType AT, typename T, class = void>
-struct _ToArgBuffer {
-    static PYOPENCL_INLINE ArgBuffer<rm_ref_t<T>, AT>
-    convert(T &buf)
-    {
-        return ArgBuffer<rm_ref_t<T>, AT>(&buf, 1);
-    }
-};
-
-template<ArgType AT=ArgType::None, typename T>
-static PYOPENCL_INLINE auto
-buf_arg(T &&buf) -> decltype(_ToArgBuffer<AT, T>::convert(std::forward<T>(buf)))
-{
-    return _ToArgBuffer<AT, T>::convert(std::forward<T>(buf));
-}
-
-template<ArgType AT=ArgType::None, typename T>
-static PYOPENCL_INLINE ArgBuffer<T, AT>
-buf_arg(T *buf, size_t l)
-{
-    return ArgBuffer<T, AT>(buf, l);
-}
-
-template<typename... T>
-static PYOPENCL_INLINE auto
-size_arg(T&&... buf)
-    -> decltype(buf_arg<ArgType::SizeOf>(std::forward<T>(buf)...))
-{
-    return buf_arg<ArgType::SizeOf>(std::forward<T>(buf)...);
-}
-
-template<typename... T>
-static PYOPENCL_INLINE auto
-len_arg(T&&... buf)
-    -> decltype(buf_arg<ArgType::Length>(std::forward<T>(buf)...))
-{
-    return buf_arg<ArgType::Length>(std::forward<T>(buf)...);
-}
-
-template<typename Buff, class = void>
-struct _ArgBufferConverter;
-
-template<typename Buff>
-struct _ArgBufferConverter<Buff,
-                           enable_if_t<Buff::arg_type == ArgType::None> > {
-    static PYOPENCL_INLINE auto
-    convert(Buff &buff) -> decltype(buff.get())
-    {
-        return buff.get();
-    }
-};
-
-template<typename Buff>
-struct _ArgBufferConverter<Buff,
-                           enable_if_t<Buff::arg_type == ArgType::SizeOf> > {
-    static PYOPENCL_INLINE auto
-    convert(Buff &buff)
-        -> decltype(std::make_tuple(type_size<typename Buff::type>::value *
-                                    buff.len(), buff.get()))
-    {
-        return std::make_tuple(type_size<typename Buff::type>::value *
-                               buff.len(), buff.get());
-    }
-};
-
-template<typename Buff>
-struct _ArgBufferConverter<Buff,
-                           enable_if_t<Buff::arg_type == ArgType::Length> > {
-    static PYOPENCL_INLINE auto
-    convert(Buff &buff) -> decltype(std::make_tuple(buff.len(), buff.get()))
-    {
-        return std::make_tuple(buff.len(), buff.get());
-    }
-};
-
-template<typename Buff>
-class CLArg<Buff, enable_if_t<std::is_base_of<ArgBuffer<typename Buff::type,
-                                                        Buff::arg_type>,
-                                              Buff>::value> > {
-private:
-    Buff &m_buff;
-public:
-    constexpr static bool is_out = !std::is_const<typename Buff::type>::value;
-    CLArg(Buff &buff) noexcept
-        : m_buff(buff)
-    {}
-    CLArg(CLArg<Buff> &&other) noexcept
-        : m_buff(other.m_buff)
-    {}
-    PYOPENCL_INLINE auto
-    convert() const noexcept
-        -> decltype(_ArgBufferConverter<Buff>::convert(m_buff))
-    {
-        return _ArgBufferConverter<Buff>::convert(m_buff);
-    }
-    PYOPENCL_INLINE void
-    print(std::ostream &stm, bool out=false)
-    {
-        print_buf(stm, m_buff.get(), m_buff.len(),
-                  Buff::arg_type, out || !is_out, out);
-    }
-};
-
-template<typename T, size_t n, ArgType AT=ArgType::None>
-class ConstBuffer : public ArgBuffer<const T, AT> {
-private:
-    T m_intern_buf[n];
-    ConstBuffer(ConstBuffer<T, n, AT>&&) = delete;
-    ConstBuffer() = delete;
-public:
-    ConstBuffer(const T *buf, size_t l, T content=0)
-        : ArgBuffer<const T, AT>(buf, n)
-    {
-        if (l < n) {
-            memcpy(m_intern_buf, buf, type_size<T>::value * l);
-            for (size_t i = l;i < n;i++) {
-                m_intern_buf[i] = content;
-            }
-            this->set(m_intern_buf);
-        }
-    }
-};
-
-struct OutArg {
-};
-
-template<typename T>
-class CLArg<T, enable_if_t<std::is_base_of<OutArg, T>::value> > {
-private:
-    bool m_converted;
-    bool m_need_cleanup;
-    T &m_arg;
-public:
-    constexpr static bool is_out = true;
-    CLArg(T &arg)
-        : m_converted(false), m_need_cleanup(false), m_arg(arg)
-    {
-    }
-    CLArg(CLArg<T> &&other) noexcept
-        : m_converted(other.m_converted), m_need_cleanup(other.m_need_cleanup),
-        m_arg(other.m_arg)
-    {
-        other.m_need_cleanup = false;
-    }
-    PYOPENCL_INLINE auto
-    convert() -> decltype(m_arg.get())
-    {
-        return m_arg.get();
-    }
-    PYOPENCL_INLINE void
-    finish(bool converted) noexcept
-    {
-        m_need_cleanup = !converted;
-    }
-    PYOPENCL_INLINE void
-    post()
-    {
-        m_arg.convert();
-        m_converted = true;
-    }
-    ~CLArg()
-    {
-        if (m_need_cleanup) {
-            m_arg.cleanup(m_converted);
-        }
-    }
-    PYOPENCL_INLINE void
-    print(std::ostream &stm, bool out=false)
-    {
-        m_arg.print(stm, out);
-    }
-};
-
-template<typename T>
-struct _D {
-    void operator()(T *p) {
-        free((void*)p);
-    }
-};
-
-template<typename T>
-class pyopencl_buf : public std::unique_ptr<T, _D<T> > {
-    size_t m_len;
-public:
-    PYOPENCL_INLINE
-    pyopencl_buf(size_t len=1)
-        : std::unique_ptr<T, _D<T> >((T*)(len ? malloc(sizeof(T) * (len + 1)) :
-                                          nullptr)), m_len(len)
-    {
-        if (len) {
-            memset((void*)this->get(), 0, (len + 1) * sizeof(T));
-        }
-    }
-    PYOPENCL_INLINE size_t
-    len() const
-    {
-        return m_len;
-    }
-    PYOPENCL_INLINE T&
-    operator[](int i)
-    {
-        return this->get()[i];
-    }
-    PYOPENCL_INLINE const T&
-    operator[](int i) const
-    {
-        return this->get()[i];
-    }
-    PYOPENCL_INLINE void
-    resize(size_t len)
-    {
-        if (len == m_len)
-            return;
-        m_len = len;
-        this->reset((T*)realloc((void*)this->release(),
-                                (len + 1) * sizeof(T)));
-    }
-};
-
-template<typename T>
-using pyopencl_buf_ele_t = typename rm_ref_t<T>::element_type;
-
-template<typename T, class = void>
-struct is_pyopencl_buf : std::false_type {};
-
-template<typename T>
-struct is_pyopencl_buf<
-    T, enable_if_t<std::is_base_of<pyopencl_buf<pyopencl_buf_ele_t<T> >,
-                                   rm_ref_t<T> >::value> > : std::true_type {};
-
-template<ArgType AT, typename T>
-struct _ToArgBuffer<AT, T, enable_if_t<is_pyopencl_buf<T>::value &&
-                                       std::is_const<rm_ref_t<T> >::value> > {
-    static PYOPENCL_INLINE ArgBuffer<const pyopencl_buf_ele_t<T>, AT>
-    convert(T &&buf)
-    {
-        return ArgBuffer<const pyopencl_buf_ele_t<T>, AT>(buf.get(), buf.len());
-    }
-};
-
-template<ArgType AT, typename T>
-struct _ToArgBuffer<AT, T, enable_if_t<is_pyopencl_buf<T>::value &&
-                                       !std::is_const<rm_ref_t<T> >::value> > {
-    static PYOPENCL_INLINE ArgBuffer<pyopencl_buf_ele_t<T>, AT>
-    convert(T &&buf)
-    {
-        return ArgBuffer<pyopencl_buf_ele_t<T>, AT>(buf.get(), buf.len());
-    }
-};
-
-template<typename Buff>
-using __pyopencl_buf_arg_type =
-    rm_ref_t<decltype(len_arg(std::declval<Buff&>()))>;
-
-template<typename Buff>
-class CLArg<Buff, enable_if_t<is_pyopencl_buf<Buff>::value> >
-    : public CLArg<__pyopencl_buf_arg_type<Buff> > {
-    typedef __pyopencl_buf_arg_type<Buff> BufType;
-    BufType m_buff;
-public:
-    PYOPENCL_INLINE
-    CLArg(Buff &buff) noexcept
-        : CLArg<BufType>(m_buff), m_buff(len_arg(buff))
-    {}
-    PYOPENCL_INLINE
-    CLArg(CLArg<Buff> &&other) noexcept
-        : CLArg<BufType>(m_buff), m_buff(std::move(other.m_buff))
-    {}
-};
-
-// FIXME
-PYOPENCL_USE_RESULT static PYOPENCL_INLINE char*
-_copy_str(const std::string& str)
-{
-    return strdup(str.c_str());
-}
-
-#endif
diff --git a/src/c_wrapper/wrap_cl.cpp b/src/c_wrapper/wrap_cl.cpp
deleted file mode 100644
index 1e001eb4eeb938f0b8d3656672084889095b6990..0000000000000000000000000000000000000000
--- a/src/c_wrapper/wrap_cl.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-#include "pyhelper.h"
-#include "clhelper.h"
-#include "platform.h"
-#include "device.h"
-#include "context.h"
-#include "command_queue.h"
-#include "event.h"
-#include "memory_object.h"
-#include "image.h"
-#include "gl_obj.h"
-#include "memory_map.h"
-#include "buffer.h"
-#include "sampler.h"
-#include "program.h"
-#include "kernel.h"
-
-template void print_buf<char>(std::ostream&, const char*, size_t,
-                              ArgType, bool, bool);
-template void print_buf<cl_int>(std::ostream&, const cl_int*, size_t,
-                                ArgType, bool, bool);
-template void print_buf<cl_uint>(std::ostream&, const cl_uint*, size_t,
-                                 ArgType, bool, bool);
-template void print_buf<cl_long>(std::ostream&, const cl_long*, size_t,
-                                 ArgType, bool, bool);
-template void print_buf<cl_ulong>(std::ostream&, const cl_ulong*, size_t,
-                                  ArgType, bool, bool);
-template void print_buf<cl_image_format>(std::ostream&,
-                                         const cl_image_format*, size_t,
-                                         ArgType, bool, bool);
-
-// {{{ c wrapper
-
-// Generic functions
-int
-get_cl_version()
-{
-    return PYOPENCL_CL_VERSION;
-}
-
-void
-free_pointer(void *p)
-{
-    free(p);
-}
-
-void
-free_pointer_array(void **p, uint32_t size)
-{
-    for (uint32_t i = 0;i < size;i++) {
-        free(p[i]);
-    }
-}
-
-
-intptr_t
-clobj__int_ptr(clobj_t obj)
-{
-    return PYOPENCL_LIKELY(obj) ? obj->intptr() : 0l;
-}
-
-static PYOPENCL_INLINE clobj_t
-_from_int_ptr(intptr_t ptr, class_t class_, bool retain)
-{
-    switch(class_) {
-    case CLASS_PLATFORM:
-        return clobj_from_int_ptr<platform>(ptr, retain);
-    case CLASS_DEVICE:
-        return clobj_from_int_ptr<device>(ptr, retain);
-    case CLASS_KERNEL:
-        return clobj_from_int_ptr<kernel>(ptr, retain);
-    case CLASS_CONTEXT:
-        return clobj_from_int_ptr<context>(ptr, retain);
-    case CLASS_COMMAND_QUEUE:
-        return clobj_from_int_ptr<command_queue>(ptr, retain);
-    case CLASS_BUFFER:
-        return clobj_from_int_ptr<buffer>(ptr, retain);
-    case CLASS_PROGRAM:
-        return clobj_from_int_ptr<program>(ptr, retain);
-    case CLASS_EVENT:
-        return clobj_from_int_ptr<event>(ptr, retain);
-    case CLASS_IMAGE:
-        return clobj_from_int_ptr<image>(ptr, retain);
-    case CLASS_SAMPLER:
-        return clobj_from_int_ptr<sampler>(ptr, retain);
-#ifdef HAVE_GL
-    case CLASS_GL_BUFFER:
-        return clobj_from_int_ptr<gl_buffer>(ptr, retain);
-    case CLASS_GL_RENDERBUFFER:
-        return clobj_from_int_ptr<gl_renderbuffer>(ptr, retain);
-#endif
-    default:
-        throw clerror("unknown class", CL_INVALID_VALUE);
-  }
-}
-
-error*
-clobj__from_int_ptr(clobj_t *out, intptr_t ptr, class_t class_, int retain)
-{
-    return c_handle_error([&] {
-            *out = _from_int_ptr(ptr, class_, retain);
-        });
-}
-
-error*
-clobj__get_info(clobj_t obj, cl_uint param, generic_info *out)
-{
-    return c_handle_error([&] {
-            if (PYOPENCL_UNLIKELY(!obj)) {
-                throw clerror("NULL input", CL_INVALID_VALUE);
-            }
-            *out = obj->get_info(param);
-        });
-}
-
-void
-clobj__delete(clobj_t obj)
-{
-    delete obj;
-}
-
-// }}}
-
-// vim: foldmethod=marker
diff --git a/src/c_wrapper/wrap_cl.h b/src/c_wrapper/wrap_cl.h
deleted file mode 100644
index 21ff9c086805056e701186adf00070ed1eee48ed..0000000000000000000000000000000000000000
--- a/src/c_wrapper/wrap_cl.h
+++ /dev/null
@@ -1,171 +0,0 @@
-#ifndef _WRAP_CL_H
-#define _WRAP_CL_H
-
-
-// CL 1.2 undecided:
-// clSetPrintfCallback
-
-// {{{ includes
-
-#include <stdint.h>
-
-#include "pyopencl_ext.h"
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-
-#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
-
-// {{{ Mac
-
-#define PYOPENCL_HAVE_EVENT_SET_CALLBACK
-
-#ifdef HAVE_GL
-
-#define PYOPENCL_GL_SHARING_VERSION 1
-
-#include <OpenGL/OpenGL.h>
-#include <OpenCL/cl_gl.h>
-#include <OpenCL/cl_gl_ext.h>
-#endif
-// }}}
-
-#else
-
-// {{{ elsewhere
-
-#if defined(_WIN32)
-
-// {{{ Windows
-
-#define NOMINMAX
-#include <windows.h>
-#define strdup _strdup
-#define strcasecmp _stricmp
-
-#if _MSC_VER >= 1900 /* VS 2015 and higher */
-#define PYOPENCL_HAVE_EVENT_SET_CALLBACK
-#endif
-
-// }}}
-
-#else
-
-// {{{ non-Windows
-
-#include <unistd.h>
-#define PYOPENCL_HAVE_EVENT_SET_CALLBACK
-
-// }}}
-
-#endif
-
-#ifdef HAVE_GL
-#include <GL/gl.h>
-#include <CL/cl_gl.h>
-#endif
-
-#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
-#define PYOPENCL_GL_SHARING_VERSION cl_khr_gl_sharing
-#endif
-
-// }}}
-
-#endif
-
-// }}}
-
-
-// {{{ version handling
-
-#ifdef PYOPENCL_PRETEND_CL_VERSION
-#define PYOPENCL_CL_VERSION PYOPENCL_PRETEND_CL_VERSION
-#else
-
-#if defined(CL_VERSION_2_2)
-#define PYOPENCL_CL_VERSION 0x2020
-#elif defined(CL_VERSION_2_1)
-#define PYOPENCL_CL_VERSION 0x2010
-#elif defined(CL_VERSION_2_0)
-#define PYOPENCL_CL_VERSION 0x2000
-#elif defined(CL_VERSION_1_2)
-#define PYOPENCL_CL_VERSION 0x1020
-#elif defined(CL_VERSION_1_1)
-#define PYOPENCL_CL_VERSION 0x1010
-#else
-#define PYOPENCL_CL_VERSION 0x1000
-#endif
-
-#endif
-
-// }}}
-
-#ifndef CL_VERSION_2_0
-typedef void* CLeglImageKHR;
-typedef void* CLeglDisplayKHR;
-typedef void* CLeglSyncKHR;
-typedef intptr_t cl_egl_image_properties_khr;
-typedef cl_bitfield         cl_device_svm_capabilities;
-typedef cl_bitfield         cl_svm_mem_flags;
-typedef intptr_t            cl_pipe_properties;
-typedef cl_uint             cl_pipe_info;
-typedef cl_bitfield         cl_sampler_properties;
-typedef cl_uint             cl_kernel_exec_info;
-#endif
-
-#ifndef CL_VERSION_1_2
-typedef intptr_t cl_device_partition_property;
-typedef cl_uint cl_kernel_arg_info;
-
-typedef struct _cl_image_desc {
-    cl_mem_object_type      image_type;
-    size_t                  image_width;
-    size_t                  image_height;
-    size_t                  image_depth;
-    size_t                  image_array_size;
-    size_t                  image_row_pitch;
-    size_t                  image_slice_pitch;
-    cl_uint                 num_mip_levels;
-    cl_uint                 num_samples;
-    cl_mem                  buffer;
-} cl_image_desc;
-
-typedef cl_bitfield cl_mem_migration_flags;
-#endif
-
-#ifndef CL_VERSION_1_1
-typedef struct _cl_buffer_region {
-    size_t                  origin;
-    size_t                  size;
-} cl_buffer_region;
-#endif
-
-#ifndef cl_ext_migrate_memobject
-typedef cl_bitfield cl_mem_migration_flags_ext;
-#endif
-
-struct clbase;
-typedef clbase *clobj_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "wrap_cl_core.h"
-
-#ifdef HAVE_GL
-#include "wrap_cl_gl_core.h"
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#if defined __GNUC__ || defined __GNUG__
-#define PYOPENCL_USE_RESULT __attribute__((warn_unused_result))
-#else
-#define PYOPENCL_USE_RESULT
-#endif
-
-#endif
-
-// vim: foldmethod=marker
diff --git a/src/c_wrapper/wrap_cl_core.h b/src/c_wrapper/wrap_cl_core.h
deleted file mode 100644
index 184cd001f5157661aa0a70732e06fc327694c32b..0000000000000000000000000000000000000000
--- a/src/c_wrapper/wrap_cl_core.h
+++ /dev/null
@@ -1,399 +0,0 @@
-// Interface between C and Python
-
-struct clbase;
-typedef struct clbase *clobj_t;
-
-// {{{ types
-
-typedef enum {
-    TYPE_FLOAT,
-    TYPE_INT,
-    TYPE_UINT,
-} type_t;
-
-typedef enum {
-    KND_UNKNOWN,
-    KND_SOURCE,
-    KND_BINARY
-} program_kind_type;
-
-typedef struct {
-    const char *routine;
-    const char *msg;
-    cl_int code;
-    int other;
-} error;
-
-typedef enum {
-    CLASS_NONE,
-    CLASS_PLATFORM,
-    CLASS_DEVICE,
-    CLASS_KERNEL,
-    CLASS_CONTEXT,
-    CLASS_BUFFER,
-    CLASS_PROGRAM,
-    CLASS_EVENT,
-    CLASS_COMMAND_QUEUE,
-    CLASS_GL_BUFFER,
-    CLASS_GL_RENDERBUFFER,
-    CLASS_IMAGE,
-    CLASS_SAMPLER
-} class_t;
-
-typedef struct {
-    class_t opaque_class;
-    const char *type;
-    bool free_type;
-    void *value;
-    bool free_value;
-} generic_info;
-
-// }}}
-
-// {{{ generic functions
-
-int get_cl_version();
-void free_pointer(void*);
-void free_pointer_array(void**, uint32_t size);
-void set_py_funcs(int (*_gc)(), void *(*_ref)(void*), void (*_deref)(void*),
-                  void (*_call)(void*, cl_int));
-int have_gl();
-
-unsigned bitlog2(unsigned long v);
-void populate_constants(void(*add)(const char*, const char*, int64_t value));
-int get_debug();
-void set_debug(int debug);
-
-// }}}
-
-// {{{ platform
-
-error *get_platforms(clobj_t **ptr_platforms, uint32_t *num_platforms);
-error *platform__get_devices(clobj_t platform, clobj_t **ptr_devices,
-                             uint32_t *num_devices, cl_device_type devtype);
-error *platform__unload_compiler(clobj_t plat);
-
-// }}}
-
-// {{{ device
-error *device__create_sub_devices(clobj_t _dev, clobj_t **_devs,
-                                  uint32_t *num_devices,
-                                  const cl_device_partition_property *props);
-
-// }}}
-
-// {{{ context
-
-error *create_context(clobj_t *ctx, const cl_context_properties *props,
-                      cl_uint num_devices, const clobj_t *ptr_devices);
-error *create_context_from_type(clobj_t *_ctx,
-                                const cl_context_properties *props,
-                                cl_device_type dev_type);
-error *context__get_supported_image_formats(clobj_t context, cl_mem_flags flags,
-                                            cl_mem_object_type image_type,
-                                            generic_info *out);
-
-// }}}
-
-// {{{ command Queue
-
-error *create_command_queue(clobj_t *queue, clobj_t context, clobj_t device,
-                            cl_command_queue_properties properties);
-error *command_queue__finish(clobj_t queue);
-error *command_queue__flush(clobj_t queue);
-
-// }}}
-
-// {{{ buffer
-error *create_buffer(clobj_t *buffer, clobj_t context, cl_mem_flags flags,
-                     size_t size, void *hostbuf);
-error *buffer__get_sub_region(clobj_t *_sub_buf, clobj_t _buf, size_t orig,
-                              size_t size, cl_mem_flags flags);
-
-// }}}
-
-// {{{ memory object
-
-error *memory_object__release(clobj_t obj);
-error *memory_object__get_host_array(clobj_t, void **hostptr, size_t *size);
-
-// }}}
-
-// {{{ memory map
-
-error *memory_map__release(clobj_t _map, clobj_t _queue,
-                           const clobj_t *_wait_for, uint32_t num_wait_for,
-                           clobj_t *evt);
-void *memory_map__data(clobj_t _map);
-
-// }}}
-
-// {{{ svm
-
-error* svm_alloc(
-    clobj_t _ctx, cl_mem_flags flags, size_t size, cl_uint alignment,
-    void **result);
-error* svm_free(clobj_t _ctx, void *svm_pointer);
-error* enqueue_svm_free(
-    clobj_t *evt, clobj_t _queue,
-    cl_uint num_svm_pointers,
-    void *svm_pointers[],
-    const clobj_t *_wait_for, uint32_t num_wait_for);
-error* enqueue_svm_memcpy(
-    clobj_t *evt, clobj_t _queue,
-    cl_bool is_blocking,
-    void *dst_ptr, const void *src_ptr, size_t size,
-    const clobj_t *_wait_for, uint32_t num_wait_for,
-    void *pyobj);
-error* enqueue_svm_memfill(
-    clobj_t *evt, clobj_t _queue,
-    void *svm_ptr,
-    const void *pattern, size_t pattern_size, size_t size,
-    const clobj_t *_wait_for, uint32_t num_wait_for);
-error* enqueue_svm_map(
-    clobj_t *evt, clobj_t _queue,
-    cl_bool blocking_map, cl_map_flags map_flags,
-    void *svm_ptr, size_t size,
-    const clobj_t *_wait_for, uint32_t num_wait_for);
-error* enqueue_svm_unmap(
-    clobj_t *evt, clobj_t _queue,
-    void *svm_ptr,
-    const clobj_t *_wait_for, uint32_t num_wait_for);
-error* enqueue_svm_migrate_mem(
-    clobj_t *evt, clobj_t _queue,
-    cl_uint num_svm_pointers,
-    const void **svm_pointers,
-    const size_t *sizes,
-    cl_mem_migration_flags flags,
-    const clobj_t *_wait_for, uint32_t num_wait_for);
-
-// }}}
-
-// {{{ program
-
-error *create_program_with_source(clobj_t *program, clobj_t context,
-                                  const char *src);
-error* create_program_with_il(clobj_t *prog, clobj_t _ctx, void *il, size_t length);
-error *create_program_with_binary(clobj_t *program, clobj_t context,
-                                  cl_uint num_devices, const clobj_t *devices,
-                                  const unsigned char **binaries,
-                                  size_t *binary_sizes);
-error *program__build(clobj_t program, const char *options,
-                      cl_uint num_devices, const clobj_t *devices);
-error *program__kind(clobj_t program, int *kind);
-error *program__get_build_info(clobj_t program, clobj_t device,
-                               cl_program_build_info param, generic_info *out);
-error *program__create_with_builtin_kernels(clobj_t *_prg, clobj_t _ctx,
-                                            const clobj_t *_devs,
-                                            uint32_t num_devs,
-                                            const char *names);
-error *program__compile(clobj_t _prg, const char *opts, const clobj_t *_devs,
-                        size_t num_devs, const clobj_t *_prgs,
-                        const char *const *names, size_t num_hdrs);
-error *program__link(clobj_t *_prg, clobj_t _ctx, const clobj_t *_prgs,
-                     size_t num_prgs, const char *opts,
-                     const clobj_t *_devs, size_t num_devs);
-error *program__all_kernels(clobj_t _prg, clobj_t **_knl, uint32_t *size);
-
-// }}}
-
-// {{{ sampler
-
-error *create_sampler(clobj_t *sampler, clobj_t context, int norm_coords,
-                      cl_addressing_mode am, cl_filter_mode fm);
-
-// }}}
-
-// {{{ kernel
-
-error *create_kernel(clobj_t *kernel, clobj_t program, const char *name);
-error *kernel__set_arg_null(clobj_t kernel, cl_uint arg_index);
-error *kernel__set_arg_mem(clobj_t kernel, cl_uint arg_index, clobj_t mem);
-error *kernel__set_arg_sampler(clobj_t kernel, cl_uint arg_index,
-                               clobj_t sampler);
-error *kernel__set_arg_buf(clobj_t kernel, cl_uint arg_index,
-                           const void *buffer, size_t size);
-error *kernel__set_arg_svm_pointer(clobj_t kernel, cl_uint arg_index, void *value);
-error *kernel__get_work_group_info(clobj_t kernel,
-                                   cl_kernel_work_group_info param,
-                                   clobj_t device, generic_info *out);
-error *kernel__get_arg_info(clobj_t _knl, cl_uint idx,
-                            cl_kernel_arg_info param, generic_info *out);
-
-// }}}
-
-// {{{ image
-error *create_image_2d(clobj_t *image, clobj_t context, cl_mem_flags flags,
-                       cl_image_format *fmt, size_t width, size_t height,
-                       size_t pitch, void *buffer);
-error *create_image_3d(clobj_t *image, clobj_t context, cl_mem_flags flags,
-                       cl_image_format *fmt, size_t width, size_t height,
-                       size_t depth, size_t pitch_x, size_t pitch_y,
-                       void *buffer);
-error *create_image_from_desc(clobj_t *img, clobj_t _ctx, cl_mem_flags flags,
-                              cl_image_format *fmt, cl_image_desc *desc,
-                              void *buffer);
-error *image__get_image_info(clobj_t img, cl_image_info param,
-                             generic_info *out);
-type_t image__get_fill_type(clobj_t img);
-// }}}
-
-// {{{ event
-
-error *event__get_profiling_info(clobj_t event, cl_profiling_info param,
-                                 generic_info *out);
-error *event__wait(clobj_t event);
-error *event__set_callback(clobj_t _evt, cl_int type, void *pyobj);
-error *wait_for_events(const clobj_t *_wait_for, uint32_t num_wait_for);
-
-// }}}
-
-// {{{ nanny event
-
-void *nanny_event__get_ward(clobj_t evt);
-
-// }}}
-
-// {{{ user event
-
-error *create_user_event(clobj_t *_evt, clobj_t _ctx);
-error *user_event__set_status(clobj_t _evt, cl_int status);
-
-// }}}
-
-// {{{ enqueue_*
-error *enqueue_nd_range_kernel(clobj_t *event, clobj_t queue,
-                               clobj_t kernel, cl_uint work_dim,
-                               const size_t *global_work_offset,
-                               const size_t *global_work_size,
-                               const size_t *local_work_size,
-                               const clobj_t *wait_for, uint32_t num_wait_for);
-error *enqueue_task(clobj_t *_evt, clobj_t _queue, clobj_t _knl,
-                    const clobj_t *_wait_for, uint32_t num_wait_for);
-
-error *enqueue_marker_with_wait_list(clobj_t *event, clobj_t queue,
-                                     const clobj_t *wait_for,
-                                     uint32_t num_wait_for);
-error *enqueue_barrier_with_wait_list(clobj_t *event, clobj_t queue,
-                                      const clobj_t *wait_for,
-                                      uint32_t num_wait_for);
-error *enqueue_wait_for_events(clobj_t _queue, const clobj_t *_wait_for,
-                               uint32_t num_wait_for);
-error *enqueue_marker(clobj_t *event, clobj_t queue);
-error *enqueue_barrier(clobj_t queue);
-error *enqueue_migrate_mem_objects(clobj_t *evt, clobj_t _queue,
-                                   const clobj_t *_mem_obj, uint32_t,
-                                   cl_mem_migration_flags flags,
-                                   const clobj_t *_wait_for, uint32_t num_wait_for);
-
-// }}}
-
-// {{{ enqueue_*_buffer*
-
-error *enqueue_read_buffer(clobj_t *event, clobj_t queue, clobj_t mem,
-                           void *buffer, size_t size, size_t device_offset,
-                           const clobj_t *wait_for, uint32_t num_wait_for,
-                           int is_blocking, void *pyobj);
-error *enqueue_copy_buffer(clobj_t *event, clobj_t queue, clobj_t src,
-                           clobj_t dst, ptrdiff_t byte_count,
-                           size_t src_offset, size_t dst_offset,
-                           const clobj_t *wait_for, uint32_t num_wait_for);
-error *enqueue_write_buffer(clobj_t *event, clobj_t queue, clobj_t mem,
-                            const void *buffer, size_t size,
-                            size_t device_offset, const clobj_t *wait_for,
-                            uint32_t num_wait_for, int is_blocking,
-                            void *pyobj);
-error *enqueue_map_buffer(clobj_t *_evt, clobj_t *mpa, clobj_t _queue,
-                          clobj_t _mem, cl_map_flags flags, size_t offset,
-                          size_t size, const clobj_t *_wait_for,
-                          uint32_t num_wait_for, int block);
-error *enqueue_fill_buffer(clobj_t *_evt, clobj_t _queue, clobj_t _mem,
-                           void *pattern, size_t psize, size_t offset,
-                           size_t size, const clobj_t *_wait_for,
-                           uint32_t num_wait_for);
-error *enqueue_read_buffer_rect(clobj_t *evt, clobj_t _queue, clobj_t _mem,
-                                void *buf, const size_t *_buf_orig,
-                                size_t buf_orig_l, const size_t *_host_orig,
-                                size_t host_orig_l, const size_t *_reg,
-                                size_t reg_l, const size_t *_buf_pitches,
-                                size_t buf_pitches_l,
-                                const size_t *_host_pitches,
-                                size_t host_pitches_l, const clobj_t *_wait_for,
-                                uint32_t num_wait_for, int block, void *pyobj);
-error *enqueue_write_buffer_rect(clobj_t *evt, clobj_t _queue, clobj_t _mem,
-                                 void *buf, const size_t *_buf_orig,
-                                 size_t buf_orig_l, const size_t *_host_orig,
-                                 size_t host_orig_l, const size_t *_reg,
-                                 size_t reg_l, const size_t *_buf_pitches,
-                                 size_t buf_pitches_l,
-                                 const size_t *_host_pitches,
-                                 size_t host_pitches_l,
-                                 const clobj_t *_wait_for,
-                                 uint32_t num_wait_for, int block, void *pyobj);
-error *enqueue_copy_buffer_rect(clobj_t *evt, clobj_t _queue, clobj_t _src,
-                                clobj_t _dst, const size_t *_src_orig,
-                                size_t src_orig_l, const size_t *_dst_orig,
-                                size_t dst_orig_l, const size_t *_reg,
-                                size_t reg_l, const size_t *_src_pitches,
-                                size_t src_pitches_l,
-                                const size_t *_dst_pitches,
-                                size_t dst_pitches_l, const clobj_t *_wait_for,
-                                uint32_t num_wait_for);
-
-// }}}
-
-// {{{ enqueue_*_image*
-
-error *enqueue_read_image(clobj_t *event, clobj_t queue, clobj_t mem,
-                          const size_t *origin, size_t origin_l,
-                          const size_t *region, size_t region_l,
-                          void *buffer, size_t row_pitch, size_t slice_pitch,
-                          const clobj_t *wait_for, uint32_t num_wait_for,
-                          int is_blocking, void *pyobj);
-error *enqueue_copy_image(clobj_t *_evt, clobj_t _queue, clobj_t _src,
-                          clobj_t _dst, const size_t *_src_origin,
-                          size_t src_origin_l, const size_t *_dst_origin,
-                          size_t dst_origin_l, const size_t *_region,
-                          size_t region_l, const clobj_t *_wait_for,
-                          uint32_t num_wait_for);
-error *enqueue_write_image(clobj_t *_evt, clobj_t _queue, clobj_t _mem,
-                           const size_t *origin, size_t origin_l,
-                           const size_t *region, size_t region_l,
-                           const void *buffer, size_t row_pitch,
-                           size_t slice_pitch, const clobj_t *_wait_for,
-                           uint32_t num_wait_for, int is_blocking,
-                           void *pyobj);
-error *enqueue_map_image(clobj_t *_evt, clobj_t *map, clobj_t _queue,
-                         clobj_t _mem, cl_map_flags flags,
-                         const size_t *_origin, size_t origin_l,
-                         const size_t *_region, size_t region_l,
-                         size_t *row_pitch, size_t *slice_pitch,
-                         const clobj_t *_wait_for, uint32_t num_wait_for,
-                         int block);
-error *enqueue_fill_image(clobj_t *evt, clobj_t _queue, clobj_t mem,
-                          const void *color, const size_t *_origin,
-                          size_t origin_l, const size_t *_region,
-                          size_t region_l, const clobj_t *_wait_for,
-                          uint32_t num_wait_for);
-error *enqueue_copy_image_to_buffer(clobj_t *evt, clobj_t _queue, clobj_t _src,
-                                    clobj_t _dst, const size_t *_orig, size_t,
-                                    const size_t *_reg, size_t, size_t offset,
-                                    const clobj_t *_wait_for, uint32_t);
-error *enqueue_copy_buffer_to_image(clobj_t *evt, clobj_t _queue, clobj_t _src,
-                                    clobj_t _dst, size_t offset,
-                                    const size_t *_orig, size_t,
-                                    const size_t *_reg, size_t,
-                                    const clobj_t *_wait_for, uint32_t);
-
-// }}}
-
-// {{{ cl object
-
-intptr_t clobj__int_ptr(clobj_t obj);
-error *clobj__get_info(clobj_t obj, cl_uint param, generic_info *out);
-void clobj__delete(clobj_t obj);
-error *clobj__from_int_ptr(clobj_t *out, intptr_t ptr, class_t, int);
-
-// }}}
-
-// vim: foldmethod=marker
diff --git a/src/c_wrapper/wrap_cl_gl_core.h b/src/c_wrapper/wrap_cl_gl_core.h
deleted file mode 100644
index 606d7c1d61c0f4006016f9724b76150c02857d79..0000000000000000000000000000000000000000
--- a/src/c_wrapper/wrap_cl_gl_core.h
+++ /dev/null
@@ -1,18 +0,0 @@
-// Interface between C and Python for GL related functions
-
-error* create_from_gl_texture(clobj_t *ptr, clobj_t _ctx, cl_mem_flags flags,
-                       GLenum texture_target, GLint miplevel,
-                       GLuint texture);
-error *create_from_gl_buffer(clobj_t *ptr, clobj_t context,
-                             cl_mem_flags flags, GLuint bufobj);
-error *create_from_gl_renderbuffer(clobj_t *ptr, clobj_t context,
-                                   cl_mem_flags flags, GLuint bufobj);
-error *enqueue_acquire_gl_objects(
-    clobj_t *event, clobj_t queue, const clobj_t *mem_objects,
-    uint32_t num_mem_objects, const clobj_t *wait_for, uint32_t num_wait_for);
-error *enqueue_release_gl_objects(
-    clobj_t *event, clobj_t queue, const clobj_t *mem_objects,
-    uint32_t num_mem_objects, const clobj_t *wait_for, uint32_t num_wait_for);
-cl_context_properties get_apple_cgl_share_group();
-error *get_gl_object_info(clobj_t mem, cl_gl_object_type *otype,
-                          GLuint *gl_name);
diff --git a/src/c_wrapper/wrap_constants.cpp b/src/c_wrapper/wrap_constants.cpp
deleted file mode 100644
index 08ed2edea0ca9e28c337127af31d26e819a93631..0000000000000000000000000000000000000000
--- a/src/c_wrapper/wrap_constants.cpp
+++ /dev/null
@@ -1,827 +0,0 @@
-#include "wrap_cl.h"
-#include <iostream>
-
-#ifdef CONST
-#undef CONST
-#endif
-
-extern "C"
-void populate_constants(void(*add)(const char*, const char*, int64_t value))
-{
-#define _ADD_ATTR(TYPE, PREFIX, NAME, SUFFIX, ...)      \
-      add(TYPE, #NAME, CL_##PREFIX##NAME##SUFFIX)
-#define ADD_ATTR(TYPE, PREFIX, NAME, ...)  \
-      _ADD_ATTR(TYPE, PREFIX, NAME, __VA_ARGS__)
-
-    // program_kind
-    add("program_kind", "UNKNOWN", KND_UNKNOWN);
-    add("program_kind", "SOURCE", KND_SOURCE);
-    add("program_kind", "BINARY", KND_BINARY);
-
-    // status_code
-    ADD_ATTR("status_code", , SUCCESS);
-    ADD_ATTR("status_code", , DEVICE_NOT_FOUND);
-    ADD_ATTR("status_code", , DEVICE_NOT_AVAILABLE);
-#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
-    ADD_ATTR("status_code", , COMPILER_NOT_AVAILABLE);
-#endif
-    ADD_ATTR("status_code", , MEM_OBJECT_ALLOCATION_FAILURE);
-    ADD_ATTR("status_code", , OUT_OF_RESOURCES);
-    ADD_ATTR("status_code", , OUT_OF_HOST_MEMORY);
-    ADD_ATTR("status_code", , PROFILING_INFO_NOT_AVAILABLE);
-    ADD_ATTR("status_code", , MEM_COPY_OVERLAP);
-    ADD_ATTR("status_code", , IMAGE_FORMAT_MISMATCH);
-    ADD_ATTR("status_code", , IMAGE_FORMAT_NOT_SUPPORTED);
-    ADD_ATTR("status_code", , BUILD_PROGRAM_FAILURE);
-    ADD_ATTR("status_code", , MAP_FAILURE);
-
-    ADD_ATTR("status_code", , INVALID_VALUE);
-    ADD_ATTR("status_code", , INVALID_DEVICE_TYPE);
-    ADD_ATTR("status_code", , INVALID_PLATFORM);
-    ADD_ATTR("status_code", , INVALID_DEVICE);
-    ADD_ATTR("status_code", , INVALID_CONTEXT);
-    ADD_ATTR("status_code", , INVALID_QUEUE_PROPERTIES);
-    ADD_ATTR("status_code", , INVALID_COMMAND_QUEUE);
-    ADD_ATTR("status_code", , INVALID_HOST_PTR);
-    ADD_ATTR("status_code", , INVALID_MEM_OBJECT);
-    ADD_ATTR("status_code", , INVALID_IMAGE_FORMAT_DESCRIPTOR);
-    ADD_ATTR("status_code", , INVALID_IMAGE_SIZE);
-    ADD_ATTR("status_code", , INVALID_SAMPLER);
-    ADD_ATTR("status_code", , INVALID_BINARY);
-    ADD_ATTR("status_code", , INVALID_BUILD_OPTIONS);
-    ADD_ATTR("status_code", , INVALID_PROGRAM);
-    ADD_ATTR("status_code", , INVALID_PROGRAM_EXECUTABLE);
-    ADD_ATTR("status_code", , INVALID_KERNEL_NAME);
-    ADD_ATTR("status_code", , INVALID_KERNEL_DEFINITION);
-    ADD_ATTR("status_code", , INVALID_KERNEL);
-    ADD_ATTR("status_code", , INVALID_ARG_INDEX);
-    ADD_ATTR("status_code", , INVALID_ARG_VALUE);
-    ADD_ATTR("status_code", , INVALID_ARG_SIZE);
-    ADD_ATTR("status_code", , INVALID_KERNEL_ARGS);
-    ADD_ATTR("status_code", , INVALID_WORK_DIMENSION);
-    ADD_ATTR("status_code", , INVALID_WORK_GROUP_SIZE);
-    ADD_ATTR("status_code", , INVALID_WORK_ITEM_SIZE);
-    ADD_ATTR("status_code", , INVALID_GLOBAL_OFFSET);
-    ADD_ATTR("status_code", , INVALID_EVENT_WAIT_LIST);
-    ADD_ATTR("status_code", , INVALID_EVENT);
-    ADD_ATTR("status_code", , INVALID_OPERATION);
-    ADD_ATTR("status_code", , INVALID_GL_OBJECT);
-    ADD_ATTR("status_code", , INVALID_BUFFER_SIZE);
-    ADD_ATTR("status_code", , INVALID_MIP_LEVEL);
-
-#if defined(cl_khr_icd) && (cl_khr_icd >= 1)
-    ADD_ATTR("status_code", , PLATFORM_NOT_FOUND_KHR);
-#endif
-
-#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
-    ADD_ATTR("status_code", , INVALID_GL_SHAREGROUP_REFERENCE_KHR);
-#endif
-
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("status_code", , MISALIGNED_SUB_BUFFER_OFFSET);
-    ADD_ATTR("status_code", , EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
-    ADD_ATTR("status_code", , INVALID_GLOBAL_WORK_SIZE);
-#endif
-
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("status_code", , COMPILE_PROGRAM_FAILURE);
-    ADD_ATTR("status_code", , LINKER_NOT_AVAILABLE);
-    ADD_ATTR("status_code", , LINK_PROGRAM_FAILURE);
-    ADD_ATTR("status_code", , DEVICE_PARTITION_FAILED);
-    ADD_ATTR("status_code", , KERNEL_ARG_INFO_NOT_AVAILABLE);
-    ADD_ATTR("status_code", , INVALID_IMAGE_DESCRIPTOR);
-    ADD_ATTR("status_code", , INVALID_COMPILER_OPTIONS);
-    ADD_ATTR("status_code", , INVALID_LINKER_OPTIONS);
-    ADD_ATTR("status_code", , INVALID_DEVICE_PARTITION_COUNT);
-#endif
-
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("status_code", , INVALID_PIPE_SIZE);
-    ADD_ATTR("status_code", , INVALID_DEVICE_QUEUE);
-#endif
-
-    // platform_info
-    ADD_ATTR("platform_info", PLATFORM_, PROFILE);
-    ADD_ATTR("platform_info", PLATFORM_, VERSION);
-    ADD_ATTR("platform_info", PLATFORM_, NAME);
-    ADD_ATTR("platform_info", PLATFORM_, VENDOR);
-#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
-    ADD_ATTR("platform_info", PLATFORM_, EXTENSIONS);
-#endif
-
-
-    // device_type
-    ADD_ATTR("device_type", DEVICE_TYPE_, DEFAULT);
-    ADD_ATTR("device_type", DEVICE_TYPE_, CPU);
-    ADD_ATTR("device_type", DEVICE_TYPE_, GPU);
-    ADD_ATTR("device_type", DEVICE_TYPE_, ACCELERATOR);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("device_type", DEVICE_TYPE_, CUSTOM);
-#endif
-    ADD_ATTR("device_type", DEVICE_TYPE_, ALL);
-
-
-    // device_info
-    ADD_ATTR("device_info", DEVICE_, TYPE);
-    ADD_ATTR("device_info", DEVICE_, VENDOR_ID);
-    ADD_ATTR("device_info", DEVICE_, MAX_COMPUTE_UNITS);
-    ADD_ATTR("device_info", DEVICE_, MAX_WORK_ITEM_DIMENSIONS);
-    ADD_ATTR("device_info", DEVICE_, MAX_WORK_GROUP_SIZE);
-    ADD_ATTR("device_info", DEVICE_, MAX_WORK_ITEM_SIZES);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_VECTOR_WIDTH_CHAR);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_VECTOR_WIDTH_SHORT);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_VECTOR_WIDTH_INT);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_VECTOR_WIDTH_LONG);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_VECTOR_WIDTH_FLOAT);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_VECTOR_WIDTH_DOUBLE);
-    ADD_ATTR("device_info", DEVICE_, MAX_CLOCK_FREQUENCY);
-    ADD_ATTR("device_info", DEVICE_, ADDRESS_BITS);
-    ADD_ATTR("device_info", DEVICE_, MAX_READ_IMAGE_ARGS);
-    ADD_ATTR("device_info", DEVICE_, MAX_WRITE_IMAGE_ARGS);
-    ADD_ATTR("device_info", DEVICE_, MAX_MEM_ALLOC_SIZE);
-    ADD_ATTR("device_info", DEVICE_, IMAGE2D_MAX_WIDTH);
-    ADD_ATTR("device_info", DEVICE_, IMAGE2D_MAX_HEIGHT);
-    ADD_ATTR("device_info", DEVICE_, IMAGE3D_MAX_WIDTH);
-    ADD_ATTR("device_info", DEVICE_, IMAGE3D_MAX_HEIGHT);
-    ADD_ATTR("device_info", DEVICE_, IMAGE3D_MAX_DEPTH);
-    ADD_ATTR("device_info", DEVICE_, IMAGE_SUPPORT);
-    ADD_ATTR("device_info", DEVICE_, MAX_PARAMETER_SIZE);
-    ADD_ATTR("device_info", DEVICE_, MAX_SAMPLERS);
-    ADD_ATTR("device_info", DEVICE_, MEM_BASE_ADDR_ALIGN);
-    ADD_ATTR("device_info", DEVICE_, MIN_DATA_TYPE_ALIGN_SIZE);
-    ADD_ATTR("device_info", DEVICE_, SINGLE_FP_CONFIG);
-#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
-    ADD_ATTR("device_info", DEVICE_, DOUBLE_FP_CONFIG);
-#endif
-#ifdef CL_DEVICE_HALF_FP_CONFIG
-    ADD_ATTR("device_info", DEVICE_, HALF_FP_CONFIG);
-#endif
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_MEM_CACHE_TYPE);
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_MEM_CACHELINE_SIZE);
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_MEM_CACHE_SIZE);
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_MEM_SIZE);
-    ADD_ATTR("device_info", DEVICE_, MAX_CONSTANT_BUFFER_SIZE);
-    ADD_ATTR("device_info", DEVICE_, MAX_CONSTANT_ARGS);
-    ADD_ATTR("device_info", DEVICE_, LOCAL_MEM_TYPE);
-    ADD_ATTR("device_info", DEVICE_, LOCAL_MEM_SIZE);
-    ADD_ATTR("device_info", DEVICE_, ERROR_CORRECTION_SUPPORT);
-    ADD_ATTR("device_info", DEVICE_, PROFILING_TIMER_RESOLUTION);
-    ADD_ATTR("device_info", DEVICE_, ENDIAN_LITTLE);
-    ADD_ATTR("device_info", DEVICE_, AVAILABLE);
-    ADD_ATTR("device_info", DEVICE_, COMPILER_AVAILABLE);
-    ADD_ATTR("device_info", DEVICE_, EXECUTION_CAPABILITIES);
-    ADD_ATTR("device_info", DEVICE_, QUEUE_PROPERTIES);
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("device_info", DEVICE_, QUEUE_ON_HOST_PROPERTIES);
-#endif
-    ADD_ATTR("device_info", DEVICE_, NAME);
-    ADD_ATTR("device_info", DEVICE_, VENDOR);
-    ADD_ATTR("device_info", , DRIVER_VERSION);
-    ADD_ATTR("device_info", DEVICE_, VERSION);
-    ADD_ATTR("device_info", DEVICE_, PROFILE);
-    ADD_ATTR("device_info", DEVICE_, EXTENSIONS);
-    ADD_ATTR("device_info", DEVICE_, PLATFORM);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_VECTOR_WIDTH_HALF);
-    ADD_ATTR("device_info", DEVICE_, HOST_UNIFIED_MEMORY); // deprecated in 2.0
-    ADD_ATTR("device_info", DEVICE_, NATIVE_VECTOR_WIDTH_CHAR);
-    ADD_ATTR("device_info", DEVICE_, NATIVE_VECTOR_WIDTH_SHORT);
-    ADD_ATTR("device_info", DEVICE_, NATIVE_VECTOR_WIDTH_INT);
-    ADD_ATTR("device_info", DEVICE_, NATIVE_VECTOR_WIDTH_LONG);
-    ADD_ATTR("device_info", DEVICE_, NATIVE_VECTOR_WIDTH_FLOAT);
-    ADD_ATTR("device_info", DEVICE_, NATIVE_VECTOR_WIDTH_DOUBLE);
-    ADD_ATTR("device_info", DEVICE_, NATIVE_VECTOR_WIDTH_HALF);
-    ADD_ATTR("device_info", DEVICE_, OPENCL_C_VERSION);
-#endif
-#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
-    ADD_ATTR("device_info", DEVICE_, COMPUTE_CAPABILITY_MAJOR_NV);
-    ADD_ATTR("device_info", DEVICE_, COMPUTE_CAPABILITY_MINOR_NV);
-    ADD_ATTR("device_info", DEVICE_, REGISTERS_PER_BLOCK_NV);
-    ADD_ATTR("device_info", DEVICE_, WARP_SIZE_NV);
-    ADD_ATTR("device_info", DEVICE_, GPU_OVERLAP_NV);
-    ADD_ATTR("device_info", DEVICE_, KERNEL_EXEC_TIMEOUT_NV);
-    ADD_ATTR("device_info", DEVICE_, INTEGRATED_MEMORY_NV);
-    // Nvidia specific device attributes, not defined in Khronos CL/cl_ext.h
-#ifdef CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV
-    ADD_ATTR("device_info", DEVICE_, ATTRIBUTE_ASYNC_ENGINE_COUNT_NV);
-#endif
-#ifdef CL_DEVICE_PCI_BUS_ID_NV
-    ADD_ATTR("device_info", DEVICE_, PCI_BUS_ID_NV);
-#endif
-#ifdef CL_DEVICE_PCI_SLOT_ID_NV
-    ADD_ATTR("device_info", DEVICE_, PCI_SLOT_ID_NV);
-#endif
-#endif
-#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
-    ADD_ATTR("device_info", DEVICE_, PROFILING_TIMER_OFFSET_AMD);
-#endif
-#ifdef CL_DEVICE_TOPOLOGY_AMD
-    ADD_ATTR("device_info", DEVICE_, TOPOLOGY_AMD);
-#endif
-#ifdef CL_DEVICE_BOARD_NAME_AMD
-    ADD_ATTR("device_info", DEVICE_, BOARD_NAME_AMD);
-#endif
-#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_FREE_MEMORY_AMD);
-#endif
-#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
-    ADD_ATTR("device_info", DEVICE_, SIMD_PER_COMPUTE_UNIT_AMD);
-#endif
-#ifdef CL_DEVICE_SIMD_WIDTH_AMD
-    ADD_ATTR("device_info", DEVICE_, SIMD_WIDTH_AMD);
-#endif
-#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
-    ADD_ATTR("device_info", DEVICE_, SIMD_INSTRUCTION_WIDTH_AMD);
-#endif
-#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
-    ADD_ATTR("device_info", DEVICE_, WAVEFRONT_WIDTH_AMD);
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_MEM_CHANNELS_AMD);
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_MEM_CHANNEL_BANKS_AMD);
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD);
-#endif
-#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
-    ADD_ATTR("device_info", DEVICE_, LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD);
-#endif
-#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
-    ADD_ATTR("device_info", DEVICE_, LOCAL_MEM_BANKS_AMD);
-#endif
-
-#ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD
-    ADD_ATTR("device_info", DEVICE_, THREAD_TRACE_SUPPORTED_AMD);
-#endif
-#ifdef CL_DEVICE_GFXIP_MAJOR_AMD
-    ADD_ATTR("device_info", DEVICE_, GFXIP_MAJOR_AMD);
-#endif
-#ifdef CL_DEVICE_GFXIP_MINOR_AMD
-    ADD_ATTR("device_info", DEVICE_, GFXIP_MINOR_AMD);
-#endif
-#ifdef CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD
-    ADD_ATTR("device_info", DEVICE_, AVAILABLE_ASYNC_QUEUES_AMD);
-#endif
-
-#ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT
-    ADD_ATTR("device_info", DEVICE_, MAX_ATOMIC_COUNTERS_EXT);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("device_info", DEVICE_, LINKER_AVAILABLE);
-    ADD_ATTR("device_info", DEVICE_, BUILT_IN_KERNELS);
-    ADD_ATTR("device_info", DEVICE_, IMAGE_MAX_BUFFER_SIZE);
-    ADD_ATTR("device_info", DEVICE_, IMAGE_MAX_ARRAY_SIZE);
-    ADD_ATTR("device_info", DEVICE_, PARENT_DEVICE);
-    ADD_ATTR("device_info", DEVICE_, PARTITION_MAX_SUB_DEVICES);
-    ADD_ATTR("device_info", DEVICE_, PARTITION_PROPERTIES);
-    ADD_ATTR("device_info", DEVICE_, PARTITION_AFFINITY_DOMAIN);
-    ADD_ATTR("device_info", DEVICE_, PARTITION_TYPE);
-    ADD_ATTR("device_info", DEVICE_, REFERENCE_COUNT);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_INTEROP_USER_SYNC);
-    ADD_ATTR("device_info", DEVICE_, PRINTF_BUFFER_SIZE);
-#endif
-#ifdef cl_khr_image2d_from_buffer
-    ADD_ATTR("device_info", DEVICE_, IMAGE_PITCH_ALIGNMENT);
-    ADD_ATTR("device_info", DEVICE_, IMAGE_BASE_ADDRESS_ALIGNMENT);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("device_info", DEVICE_, MAX_READ_WRITE_IMAGE_ARGS);
-    ADD_ATTR("device_info", DEVICE_, MAX_GLOBAL_VARIABLE_SIZE);
-    ADD_ATTR("device_info", DEVICE_, QUEUE_ON_DEVICE_PROPERTIES);
-    ADD_ATTR("device_info", DEVICE_, QUEUE_ON_DEVICE_PREFERRED_SIZE);
-    ADD_ATTR("device_info", DEVICE_, QUEUE_ON_DEVICE_MAX_SIZE);
-    ADD_ATTR("device_info", DEVICE_, MAX_ON_DEVICE_QUEUES);
-    ADD_ATTR("device_info", DEVICE_, MAX_ON_DEVICE_EVENTS);
-    ADD_ATTR("device_info", DEVICE_, SVM_CAPABILITIES);
-    ADD_ATTR("device_info", DEVICE_, GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE);
-    ADD_ATTR("device_info", DEVICE_, MAX_PIPE_ARGS);
-    ADD_ATTR("device_info", DEVICE_, PIPE_MAX_ACTIVE_RESERVATIONS);
-    ADD_ATTR("device_info", DEVICE_, PIPE_MAX_PACKET_SIZE);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_PLATFORM_ATOMIC_ALIGNMENT);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_GLOBAL_ATOMIC_ALIGNMENT);
-    ADD_ATTR("device_info", DEVICE_, PREFERRED_LOCAL_ATOMIC_ALIGNMENT);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2010
-    ADD_ATTR("device_info", DEVICE_, IL_VERSION);
-    ADD_ATTR("device_info", DEVICE_, MAX_NUM_SUB_GROUPS);
-    ADD_ATTR("device_info", DEVICE_, SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS);
-#endif
-    /* cl_intel_advanced_motion_estimation */
-#ifdef CL_DEVICE_ME_VERSION_INTEL
-    ADD_ATTR("device_info", DEVICE_, ME_VERSION_INTEL);
-#endif
-
-    /* cl_qcom_ext_host_ptr */
-#ifdef CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM
-    ADD_ATTR("device_info", DEVICE_, EXT_MEM_PADDING_IN_BYTES_QCOM);
-#endif
-#ifdef CL_DEVICE_PAGE_SIZE_QCOM
-    ADD_ATTR("device_info", DEVICE_, PAGE_SIZE_QCOM);
-#endif
-
-    /* cl_khr_spir */
-#ifdef CL_DEVICE_SPIR_VERSIONS
-    ADD_ATTR("device_info", DEVICE_, SPIR_VERSIONS);
-#endif
-
-    /* cl_altera_device_temperature */
-#ifdef CL_DEVICE_CORE_TEMPERATURE_ALTERA
-    ADD_ATTR("device_info", DEVICE_, CORE_TEMPERATURE_ALTERA);
-#endif
-
-    /* cl_intel_simultaneous_sharing */
-#ifdef CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL
-    ADD_ATTR("device_info", DEVICE_, SIMULTANEOUS_INTEROPS_INTEL);
-#endif
-#ifdef CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL
-    ADD_ATTR("device_info", DEVICE_, NUM_SIMULTANEOUS_INTEROPS_INTEL);
-#endif
-
-    // device_fp_config
-    ADD_ATTR("device_fp_config", FP_, DENORM);
-    ADD_ATTR("device_fp_config", FP_, INF_NAN);
-    ADD_ATTR("device_fp_config", FP_, ROUND_TO_NEAREST);
-    ADD_ATTR("device_fp_config", FP_, ROUND_TO_ZERO);
-    ADD_ATTR("device_fp_config", FP_, ROUND_TO_INF);
-    ADD_ATTR("device_fp_config", FP_, FMA);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("device_fp_config", FP_, SOFT_FLOAT);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("device_fp_config", FP_, CORRECTLY_ROUNDED_DIVIDE_SQRT);
-#endif
-
-
-    // device_mem_cache_type
-    ADD_ATTR("device_mem_cache_type",  , NONE);
-    ADD_ATTR("device_mem_cache_type",  , READ_ONLY_CACHE);
-    ADD_ATTR("device_mem_cache_type",  , READ_WRITE_CACHE);
-
-
-    // device_local_mem_type
-    ADD_ATTR("device_local_mem_type",  , LOCAL);
-    ADD_ATTR("device_local_mem_type",  , GLOBAL);
-
-
-    // device_exec_capabilities
-    ADD_ATTR("device_exec_capabilities", EXEC_, KERNEL);
-    ADD_ATTR("device_exec_capabilities", EXEC_, NATIVE_KERNEL);
-#ifdef CL_EXEC_IMMEDIATE_EXECUTION_INTEL
-    ADD_ATTR("device_exec_capabilities", EXEC_, IMMEDIATE_EXECUTION_INTEL);
-#endif
-
-#if PYOPENCL_CL_VERSION >= 0x2000
-    // device_svm_capabilities
-    ADD_ATTR("device_svm_capabilities", DEVICE_SVM_, COARSE_GRAIN_BUFFER);
-    ADD_ATTR("device_svm_capabilities", DEVICE_SVM_, FINE_GRAIN_BUFFER);
-    ADD_ATTR("device_svm_capabilities", DEVICE_SVM_, FINE_GRAIN_SYSTEM);
-    ADD_ATTR("device_svm_capabilities", DEVICE_SVM_, ATOMICS);
-#endif
-
-
-    // command_queue_properties
-    ADD_ATTR("command_queue_properties", QUEUE_, OUT_OF_ORDER_EXEC_MODE_ENABLE);
-    ADD_ATTR("command_queue_properties", QUEUE_, PROFILING_ENABLE);
-#ifdef CL_QUEUE_IMMEDIATE_EXECUTION_ENABLE_INTEL
-    ADD_ATTR("command_queue_properties", QUEUE_, IMMEDIATE_EXECUTION_ENABLE_INTEL);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("command_queue_properties", QUEUE_, ON_DEVICE);
-    ADD_ATTR("command_queue_properties", QUEUE_, ON_DEVICE_DEFAULT);
-#endif
-
-
-    // context_info
-    ADD_ATTR("context_info", CONTEXT_, REFERENCE_COUNT);
-    ADD_ATTR("context_info", CONTEXT_, DEVICES);
-    ADD_ATTR("context_info", CONTEXT_, PROPERTIES);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("context_info", CONTEXT_, NUM_DEVICES);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("context_info", CONTEXT_, INTEROP_USER_SYNC);
-#endif
-
-
-    // gl_context_info
-#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
-    ADD_ATTR("gl_context_info", , CURRENT_DEVICE_FOR_GL_CONTEXT_KHR);
-    ADD_ATTR("gl_context_info", , DEVICES_FOR_GL_CONTEXT_KHR);
-#endif
-
-
-    // context_properties
-    ADD_ATTR("context_properties", CONTEXT_, PLATFORM);
-#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
-    ADD_ATTR("context_properties",  ,GL_CONTEXT_KHR);
-    ADD_ATTR("context_properties",  ,EGL_DISPLAY_KHR);
-    ADD_ATTR("context_properties",  ,GLX_DISPLAY_KHR);
-    ADD_ATTR("context_properties",  ,WGL_HDC_KHR);
-    ADD_ATTR("context_properties",  ,CGL_SHAREGROUP_KHR);
-#endif
-#if defined(__APPLE__) && defined(HAVE_GL) && !defined(PYOPENCL_APPLE_USE_CL_H)
-    ADD_ATTR("context_properties",  ,CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE);
-#endif /* __APPLE__ */
-#ifdef CL_CONTEXT_OFFLINE_DEVICES_AMD
-    ADD_ATTR("context_properties", CONTEXT_, OFFLINE_DEVICES_AMD);
-#endif
-
-
-    // command_queue_info
-    ADD_ATTR("command_queue_info", QUEUE_, CONTEXT);
-    ADD_ATTR("command_queue_info", QUEUE_, DEVICE);
-    ADD_ATTR("command_queue_info", QUEUE_, REFERENCE_COUNT);
-    ADD_ATTR("command_queue_info", QUEUE_, PROPERTIES);
-
-
-    // queue_properties
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("queue_properties", QUEUE_, PROPERTIES);
-    ADD_ATTR("queue_properties", QUEUE_, SIZE);
-#endif
-
-
-    // mem_flags
-    ADD_ATTR("mem_flags", MEM_, READ_WRITE);
-    ADD_ATTR("mem_flags", MEM_, WRITE_ONLY);
-    ADD_ATTR("mem_flags", MEM_, READ_ONLY);
-    ADD_ATTR("mem_flags", MEM_, USE_HOST_PTR);
-    ADD_ATTR("mem_flags", MEM_, ALLOC_HOST_PTR);
-    ADD_ATTR("mem_flags", MEM_, COPY_HOST_PTR);
-#ifdef cl_amd_device_memory_flags
-    ADD_ATTR("mem_flags", MEM_, USE_PERSISTENT_MEM_AMD);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("mem_flags", MEM_, HOST_WRITE_ONLY);
-    ADD_ATTR("mem_flags", MEM_, HOST_READ_ONLY);
-    ADD_ATTR("mem_flags", MEM_, HOST_NO_ACCESS);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("mem_flags", MEM_, KERNEL_READ_AND_WRITE);
-#endif
-
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("svm_mem_flags", MEM_, READ_WRITE);
-    ADD_ATTR("svm_mem_flags", MEM_, WRITE_ONLY);
-    ADD_ATTR("svm_mem_flags", MEM_, READ_ONLY);
-    ADD_ATTR("svm_mem_flags", MEM_, SVM_FINE_GRAIN_BUFFER);
-    ADD_ATTR("svm_mem_flags", MEM_, SVM_ATOMICS);
-#endif
-
-
-    // channel_order
-    ADD_ATTR("channel_order",  , R);
-    ADD_ATTR("channel_order",  , A);
-    ADD_ATTR("channel_order",  , RG);
-    ADD_ATTR("channel_order",  , RA);
-    ADD_ATTR("channel_order",  , RGB);
-    ADD_ATTR("channel_order",  , RGBA);
-    ADD_ATTR("channel_order",  , BGRA);
-    ADD_ATTR("channel_order",  , INTENSITY);
-    ADD_ATTR("channel_order",  , LUMINANCE);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("channel_order",  , Rx);
-    ADD_ATTR("channel_order",  , RGx);
-    ADD_ATTR("channel_order",  , RGBx);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("channel_order",  , sRGB);
-    ADD_ATTR("channel_order",  , sRGBx);
-    ADD_ATTR("channel_order",  , sRGBA);
-    ADD_ATTR("channel_order",  , sBGRA);
-    ADD_ATTR("channel_order",  , ABGR);
-#endif
-
-
-    // channel_type
-    ADD_ATTR("channel_type",  , SNORM_INT8);
-    ADD_ATTR("channel_type",  , SNORM_INT16);
-    ADD_ATTR("channel_type",  , UNORM_INT8);
-    ADD_ATTR("channel_type",  , UNORM_INT16);
-    ADD_ATTR("channel_type",  , UNORM_SHORT_565);
-    ADD_ATTR("channel_type",  , UNORM_SHORT_555);
-    ADD_ATTR("channel_type",  , UNORM_INT_101010);
-    ADD_ATTR("channel_type",  , SIGNED_INT8);
-    ADD_ATTR("channel_type",  , SIGNED_INT16);
-    ADD_ATTR("channel_type",  , SIGNED_INT32);
-    ADD_ATTR("channel_type",  , UNSIGNED_INT8);
-    ADD_ATTR("channel_type",  , UNSIGNED_INT16);
-    ADD_ATTR("channel_type",  , UNSIGNED_INT32);
-    ADD_ATTR("channel_type",  , HALF_FLOAT);
-    ADD_ATTR("channel_type",  , FLOAT);
-
-
-    // mem_object_type
-    ADD_ATTR("mem_object_type", MEM_OBJECT_, BUFFER);
-    ADD_ATTR("mem_object_type", MEM_OBJECT_, IMAGE2D);
-    ADD_ATTR("mem_object_type", MEM_OBJECT_, IMAGE3D);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("mem_object_type", MEM_OBJECT_, IMAGE2D_ARRAY);
-    ADD_ATTR("mem_object_type", MEM_OBJECT_, IMAGE1D);
-    ADD_ATTR("mem_object_type", MEM_OBJECT_, IMAGE1D_ARRAY);
-    ADD_ATTR("mem_object_type", MEM_OBJECT_, IMAGE1D_BUFFER);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("mem_object_type", MEM_OBJECT_, PIPE);
-#endif
-
-
-    // mem_info
-    ADD_ATTR("mem_info", MEM_, TYPE);
-    ADD_ATTR("mem_info", MEM_, FLAGS);
-    ADD_ATTR("mem_info", MEM_, SIZE);
-    ADD_ATTR("mem_info", MEM_, HOST_PTR);
-    ADD_ATTR("mem_info", MEM_, MAP_COUNT);
-    ADD_ATTR("mem_info", MEM_, REFERENCE_COUNT);
-    ADD_ATTR("mem_info", MEM_, CONTEXT);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("mem_info", MEM_, ASSOCIATED_MEMOBJECT);
-    ADD_ATTR("mem_info", MEM_, OFFSET);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("mem_info", MEM_, USES_SVM_POINTER);
-#endif
-
-
-    // image_info
-    ADD_ATTR("image_info", IMAGE_, FORMAT);
-    ADD_ATTR("image_info", IMAGE_, ELEMENT_SIZE);
-    ADD_ATTR("image_info", IMAGE_, ROW_PITCH);
-    ADD_ATTR("image_info", IMAGE_, SLICE_PITCH);
-    ADD_ATTR("image_info", IMAGE_, WIDTH);
-    ADD_ATTR("image_info", IMAGE_, HEIGHT);
-    ADD_ATTR("image_info", IMAGE_, DEPTH);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("image_info", IMAGE_, ARRAY_SIZE);
-    ADD_ATTR("image_info", IMAGE_, BUFFER);
-    ADD_ATTR("image_info", IMAGE_, NUM_MIP_LEVELS);
-    ADD_ATTR("image_info", IMAGE_, NUM_SAMPLES);
-#endif
-
-
-    // addressing_mode
-    ADD_ATTR("addressing_mode", ADDRESS_, NONE);
-    ADD_ATTR("addressing_mode", ADDRESS_, CLAMP_TO_EDGE);
-    ADD_ATTR("addressing_mode", ADDRESS_, CLAMP);
-    ADD_ATTR("addressing_mode", ADDRESS_, REPEAT);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("addressing_mode", ADDRESS_, MIRRORED_REPEAT);
-#endif
-
-
-    // filter_mode
-    ADD_ATTR("filter_mode", FILTER_, NEAREST);
-    ADD_ATTR("filter_mode", FILTER_, LINEAR);
-
-
-    // sampler_info
-    ADD_ATTR("sampler_info", SAMPLER_, REFERENCE_COUNT);
-    ADD_ATTR("sampler_info", SAMPLER_, CONTEXT);
-    ADD_ATTR("sampler_info", SAMPLER_, NORMALIZED_COORDS);
-    ADD_ATTR("sampler_info", SAMPLER_, ADDRESSING_MODE);
-    ADD_ATTR("sampler_info", SAMPLER_, FILTER_MODE);
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("sampler_info", SAMPLER_, MIP_FILTER_MODE);
-    ADD_ATTR("sampler_info", SAMPLER_, LOD_MIN);
-    ADD_ATTR("sampler_info", SAMPLER_, LOD_MAX);
-#endif
-
-
-    // map_flags
-    ADD_ATTR("map_flags", MAP_, READ);
-    ADD_ATTR("map_flags", MAP_, WRITE);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("map_flags", MAP_, WRITE_INVALIDATE_REGION);
-#endif
-
-
-    // program_info
-    ADD_ATTR("program_info", PROGRAM_, REFERENCE_COUNT);
-    ADD_ATTR("program_info", PROGRAM_, CONTEXT);
-    ADD_ATTR("program_info", PROGRAM_, NUM_DEVICES);
-    ADD_ATTR("program_info", PROGRAM_, DEVICES);
-    ADD_ATTR("program_info", PROGRAM_, SOURCE);
-    ADD_ATTR("program_info", PROGRAM_, BINARY_SIZES);
-    ADD_ATTR("program_info", PROGRAM_, BINARIES);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("program_info", PROGRAM_, NUM_KERNELS);
-    ADD_ATTR("program_info", PROGRAM_, KERNEL_NAMES);
-#endif
-
-
-    // program_build_info
-    ADD_ATTR("program_build_info", PROGRAM_BUILD_, STATUS);
-    ADD_ATTR("program_build_info", PROGRAM_BUILD_, OPTIONS);
-    ADD_ATTR("program_build_info", PROGRAM_BUILD_, LOG);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("program_build_info", PROGRAM_, BINARY_TYPE);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("program_build_info", PROGRAM_BUILD_, GLOBAL_VARIABLE_TOTAL_SIZE);
-#endif
-
-
-    // program_binary_type
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("program_binary_type", PROGRAM_BINARY_TYPE_, NONE);
-    ADD_ATTR("program_binary_type", PROGRAM_BINARY_TYPE_, COMPILED_OBJECT);
-    ADD_ATTR("program_binary_type", PROGRAM_BINARY_TYPE_, LIBRARY);
-    ADD_ATTR("program_binary_type", PROGRAM_BINARY_TYPE_, EXECUTABLE);
-#endif
-
-
-    // kernel_info
-    ADD_ATTR("kernel_info", KERNEL_, FUNCTION_NAME);
-    ADD_ATTR("kernel_info", KERNEL_, NUM_ARGS);
-    ADD_ATTR("kernel_info", KERNEL_, REFERENCE_COUNT);
-    ADD_ATTR("kernel_info", KERNEL_, CONTEXT);
-    ADD_ATTR("kernel_info", KERNEL_, PROGRAM);
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("kernel_info", KERNEL_, ATTRIBUTES);
-#endif
-
-
-    // kernel_arg_info
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("kernel_arg_info", KERNEL_ARG_, ADDRESS_QUALIFIER);
-    ADD_ATTR("kernel_arg_info", KERNEL_ARG_, ACCESS_QUALIFIER);
-    ADD_ATTR("kernel_arg_info", KERNEL_ARG_, TYPE_NAME);
-    ADD_ATTR("kernel_arg_info", KERNEL_ARG_, TYPE_QUALIFIER);
-    ADD_ATTR("kernel_arg_info", KERNEL_ARG_, NAME);
-#endif
-
-
-    // kernel_arg_address_qualifier
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("kernel_arg_address_qualifier", KERNEL_ARG_ADDRESS_, GLOBAL);
-    ADD_ATTR("kernel_arg_address_qualifier", KERNEL_ARG_ADDRESS_, LOCAL);
-    ADD_ATTR("kernel_arg_address_qualifier", KERNEL_ARG_ADDRESS_, CONSTANT);
-    ADD_ATTR("kernel_arg_address_qualifier", KERNEL_ARG_ADDRESS_, PRIVATE);
-#endif
-
-
-    // kernel_arg_access_qualifier
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("kernel_arg_access_qualifier", KERNEL_ARG_ACCESS_, READ_ONLY);
-    ADD_ATTR("kernel_arg_access_qualifier", KERNEL_ARG_ACCESS_, WRITE_ONLY);
-    ADD_ATTR("kernel_arg_access_qualifier", KERNEL_ARG_ACCESS_, READ_WRITE);
-    ADD_ATTR("kernel_arg_access_qualifier", KERNEL_ARG_ACCESS_, NONE);
-#endif
-
-
-    // kernel_arg_type_qualifier
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("kernel_arg_type_qualifier", KERNEL_ARG_TYPE_, NONE);
-    ADD_ATTR("kernel_arg_type_qualifier", KERNEL_ARG_TYPE_, CONST);
-    ADD_ATTR("kernel_arg_type_qualifier", KERNEL_ARG_TYPE_, RESTRICT);
-    ADD_ATTR("kernel_arg_type_qualifier", KERNEL_ARG_TYPE_, VOLATILE);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("kernel_arg_type_qualifier", KERNEL_ARG_TYPE_, PIPE);
-#endif
-
-
-    // kernel_work_group_info
-    ADD_ATTR("kernel_work_group_info", KERNEL_, WORK_GROUP_SIZE);
-    ADD_ATTR("kernel_work_group_info", KERNEL_, COMPILE_WORK_GROUP_SIZE);
-    ADD_ATTR("kernel_work_group_info", KERNEL_, LOCAL_MEM_SIZE);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("kernel_work_group_info", KERNEL_, PREFERRED_WORK_GROUP_SIZE_MULTIPLE);
-    ADD_ATTR("kernel_work_group_info", KERNEL_, PRIVATE_MEM_SIZE);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("kernel_work_group_info", KERNEL_, GLOBAL_WORK_SIZE);
-#endif
-
-
-    // event_info
-    ADD_ATTR("event_info", EVENT_, COMMAND_QUEUE);
-    ADD_ATTR("event_info", EVENT_, COMMAND_TYPE);
-    ADD_ATTR("event_info", EVENT_, REFERENCE_COUNT);
-    ADD_ATTR("event_info", EVENT_, COMMAND_EXECUTION_STATUS);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("event_info", EVENT_, CONTEXT);
-#endif
-
-
-    // command_type
-    ADD_ATTR("command_type", COMMAND_, NDRANGE_KERNEL);
-    ADD_ATTR("command_type", COMMAND_, TASK);
-    ADD_ATTR("command_type", COMMAND_, NATIVE_KERNEL);
-    ADD_ATTR("command_type", COMMAND_, READ_BUFFER);
-    ADD_ATTR("command_type", COMMAND_, WRITE_BUFFER);
-    ADD_ATTR("command_type", COMMAND_, COPY_BUFFER);
-    ADD_ATTR("command_type", COMMAND_, READ_IMAGE);
-    ADD_ATTR("command_type", COMMAND_, WRITE_IMAGE);
-    ADD_ATTR("command_type", COMMAND_, COPY_IMAGE);
-    ADD_ATTR("command_type", COMMAND_, COPY_IMAGE_TO_BUFFER);
-    ADD_ATTR("command_type", COMMAND_, COPY_BUFFER_TO_IMAGE);
-    ADD_ATTR("command_type", COMMAND_, MAP_BUFFER);
-    ADD_ATTR("command_type", COMMAND_, MAP_IMAGE);
-    ADD_ATTR("command_type", COMMAND_, UNMAP_MEM_OBJECT);
-    ADD_ATTR("command_type", COMMAND_, MARKER);
-    ADD_ATTR("command_type", COMMAND_, ACQUIRE_GL_OBJECTS);
-    ADD_ATTR("command_type", COMMAND_, RELEASE_GL_OBJECTS);
-#if PYOPENCL_CL_VERSION >= 0x1010
-    ADD_ATTR("command_type", COMMAND_, READ_BUFFER_RECT);
-    ADD_ATTR("command_type", COMMAND_, WRITE_BUFFER_RECT);
-    ADD_ATTR("command_type", COMMAND_, COPY_BUFFER_RECT);
-    ADD_ATTR("command_type", COMMAND_, USER);
-#endif
-#ifdef cl_ext_migrate_memobject
-    ADD_ATTR("command_type", COMMAND_, MIGRATE_MEM_OBJECT_EXT);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("command_type", COMMAND_, BARRIER);
-    ADD_ATTR("command_type", COMMAND_, MIGRATE_MEM_OBJECTS);
-    ADD_ATTR("command_type", COMMAND_, FILL_BUFFER);
-    ADD_ATTR("command_type", COMMAND_, FILL_IMAGE);
-#endif
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("command_type", COMMAND_, SVM_FREE);
-    ADD_ATTR("command_type", COMMAND_, SVM_MEMCPY);
-    ADD_ATTR("command_type", COMMAND_, SVM_MEMFILL);
-    ADD_ATTR("command_type", COMMAND_, SVM_MAP);
-    ADD_ATTR("command_type", COMMAND_, SVM_UNMAP);
-#endif
-
-
-    // command_execution_status
-    ADD_ATTR("command_execution_status", , COMPLETE);
-    ADD_ATTR("command_execution_status", , RUNNING);
-    ADD_ATTR("command_execution_status", , SUBMITTED);
-    ADD_ATTR("command_execution_status", , QUEUED);
-
-
-    // profiling_info
-    ADD_ATTR("profiling_info", PROFILING_COMMAND_, QUEUED);
-    ADD_ATTR("profiling_info", PROFILING_COMMAND_, SUBMIT);
-    ADD_ATTR("profiling_info", PROFILING_COMMAND_, START);
-    ADD_ATTR("profiling_info", PROFILING_COMMAND_, END);
-#if PYOPENCL_CL_VERSION >= 0x2000
-    ADD_ATTR("profiling_info", PROFILING_COMMAND_, COMPLETE);
-#endif
-
-
-    // mem_migration_flags
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("mem_migration_flags", MIGRATE_MEM_OBJECT_, HOST);
-    ADD_ATTR("mem_migration_flags", MIGRATE_MEM_OBJECT_, CONTENT_UNDEFINED);
-#endif
-
-
-    // mem_migration_flags_ext
-#ifdef cl_ext_migrate_memobject
-    ADD_ATTR("mem_migration_flags_ext", MIGRATE_MEM_OBJECT_, HOST, _EXT);
-
-    // As of 2018-07-11, the official headers seem to have dropped this:
-#ifdef CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED_EXT
-    ADD_ATTR("mem_migration_flags_ext", MIGRATE_MEM_OBJECT_,
-             CONTENT_UNDEFINED, _EXT);
-#endif
-
-#endif
-
-
-    // device_partition_property
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("device_partition_property", DEVICE_PARTITION_, EQUALLY);
-    ADD_ATTR("device_partition_property", DEVICE_PARTITION_, BY_COUNTS);
-    ADD_ATTR("device_partition_property", DEVICE_PARTITION_, BY_COUNTS_LIST_END);
-    ADD_ATTR("device_partition_property", DEVICE_PARTITION_, BY_AFFINITY_DOMAIN);
-#endif
-
-
-    // device_affinity_domain
-#if PYOPENCL_CL_VERSION >= 0x1020
-    ADD_ATTR("device_affinity_domain", DEVICE_AFFINITY_DOMAIN_, NUMA);
-    ADD_ATTR("device_affinity_domain", DEVICE_AFFINITY_DOMAIN_, L4_CACHE);
-    ADD_ATTR("device_affinity_domain", DEVICE_AFFINITY_DOMAIN_, L3_CACHE);
-    ADD_ATTR("device_affinity_domain", DEVICE_AFFINITY_DOMAIN_, L2_CACHE);
-    ADD_ATTR("device_affinity_domain", DEVICE_AFFINITY_DOMAIN_, L1_CACHE);
-    ADD_ATTR("device_affinity_domain", DEVICE_AFFINITY_DOMAIN_,
-             NEXT_PARTITIONABLE);
-#endif
-
-
-#ifdef HAVE_GL
-    // gl_object_type
-    ADD_ATTR("gl_object_type", GL_OBJECT_, BUFFER);
-    ADD_ATTR("gl_object_type", GL_OBJECT_, TEXTURE2D);
-    ADD_ATTR("gl_object_type", GL_OBJECT_, TEXTURE3D);
-    ADD_ATTR("gl_object_type", GL_OBJECT_, RENDERBUFFER);
-
-
-    // gl_texture_info
-    ADD_ATTR("gl_texture_info", GL_, TEXTURE_TARGET);
-    ADD_ATTR("gl_texture_info", GL_, MIPMAP_LEVEL);
-#endif
-
-
-    // migrate_mem_object_flags_ext
-#ifdef cl_ext_migrate_memobject
-    ADD_ATTR("migrate_mem_object_flags_ext", MIGRATE_MEM_OBJECT_, HOST, _EXT);
-#endif
-}
diff --git a/src/mempool.hpp b/src/mempool.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b24fcb02f8528b9e06f8011b69849a8a804b939d
--- /dev/null
+++ b/src/mempool.hpp
@@ -0,0 +1,369 @@
+// Abstract memory pool implementation
+
+#ifndef _AFJDFJSDFSD_PYGPU_HEADER_SEEN_MEMPOOL_HPP
+#define _AFJDFJSDFSD_PYGPU_HEADER_SEEN_MEMPOOL_HPP
+
+
+#include <cassert>
+#include <vector>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <iostream>
+#include "wrap_cl.hpp"
+#include "bitlog.hpp"
+
+
+namespace PYGPU_PACKAGE
+{
+  // Shift x left by shift_amount bits; a negative amount shifts right instead.
+  template <class T>
+  inline T signed_left_shift(T x, signed shift_amount)
+  {
+    return (shift_amount >= 0)
+      ? x << shift_amount
+      : x >> -shift_amount;
+  }
+
+
+
+
+  // Shift x right by shift_amount bits; a negative amount shifts left instead.
+  template <class T>
+  inline T signed_right_shift(T x, signed shift_amount)
+  {
+    return (shift_amount >= 0)
+      ? x >> shift_amount
+      : x << -shift_amount;
+  }
+
+  // Pool that recycles blocks obtained from Allocator. Requested sizes are
+  // rounded up to the largest size of their bin (see alloc_size), and freed
+  // blocks are parked in that bin until a later request reuses them.
+  template<class Allocator>
+  class memory_pool : noncopyable
+  {
+    public:
+      typedef typename Allocator::pointer_type pointer_type;
+      typedef typename Allocator::size_type size_type;
+
+    private:
+      typedef uint32_t bin_nr_t;
+      typedef std::vector<pointer_type> bin_t;
+
+      typedef std::map<bin_nr_t, bin_t> container_t;
+      container_t m_container;
+      typedef typename container_t::value_type bin_pair_t;
+
+      std::unique_ptr<Allocator> m_allocator;
+
+      // A held block is one that's been released by the application, but that
+      // we are keeping around to dish out again.
+      unsigned m_held_blocks;
+
+      // An active block is one that is in use by the application.
+      unsigned m_active_blocks;
+      // m_stop_holding: free() returns blocks to the allocator, not to a bin.
+      bool m_stop_holding;
+      int m_trace;
+
+    public:
+      memory_pool(Allocator const &alloc=Allocator())
+        : m_allocator(alloc.copy()),
+        m_held_blocks(0), m_active_blocks(0), m_stop_holding(false),
+        m_trace(0)
+      {
+        if (m_allocator->is_deferred())
+        {
+          PyErr_WarnEx(PyExc_UserWarning, "Memory pools expect non-deferred "
+              "semantics from their allocators. You passed a deferred "
+              "allocator, i.e. an allocator whose allocations can turn out to "
+              "be unavailable long after allocation.", 1);
+        }
+      }
+      // Hands every held block back to the allocator.
+      virtual ~memory_pool()
+      { free_held(); }
+      // Number of bits below the leading one bit that distinguish bins.
+      static const unsigned mantissa_bits = 2;
+      static const unsigned mantissa_mask = (1 << mantissa_bits) - 1;
+      // Bin id for size: bitlog2(size) in the high bits, mantissa in the low.
+      static bin_nr_t bin_number(size_type size)
+      {
+        signed l = bitlog2(size);
+        size_type shifted = signed_right_shift(size, l-signed(mantissa_bits));
+        if (size && (shifted & (1 << mantissa_bits)) == 0)
+          throw std::runtime_error("memory_pool::bin_number: bitlog2 fault");
+        size_type chopped = shifted & mantissa_mask;
+        return l << mantissa_bits | chopped;
+      }
+      // Increments (true) or decrements (false) the trace counter; nonzero traces.
+      void set_trace(bool flag)
+      {
+        if (flag)
+          ++m_trace;
+        else
+          --m_trace;
+      }
+      // Largest size that maps to bin, i.e. the size actually allocated for it.
+      static size_type alloc_size(bin_nr_t bin)
+      {
+        bin_nr_t exponent = bin >> mantissa_bits;
+        bin_nr_t mantissa = bin & mantissa_mask;
+        // Shift in size_type: 32-bit operands lose bits for bins of 4 GiB and up.
+        size_type ones = signed_left_shift((size_type) 1,
+            signed(exponent)-signed(mantissa_bits)
+            );
+        if (ones) ones -= 1;
+
+        size_type head = signed_left_shift(
+           (size_type) ((1<<mantissa_bits) | mantissa),
+            signed(exponent)-signed(mantissa_bits));
+        if (ones & head)
+          throw std::runtime_error("memory_pool::alloc_size: bit-counting fault");
+        return head | ones;
+      }
+
+    protected:
+      bin_t &get_bin(bin_nr_t bin_nr)
+      {
+        typename container_t::iterator it = m_container.find(bin_nr);
+        if (it == m_container.end())
+        {
+          auto it_and_inserted = m_container.insert(std::make_pair(bin_nr, bin_t()));
+          assert(it_and_inserted.second);
+          return it_and_inserted.first->second;
+        }
+        else
+          return it->second;
+      }
+
+      void inc_held_blocks()
+      {
+        if (m_held_blocks == 0)
+          start_holding_blocks();
+        ++m_held_blocks;
+      }
+
+      void dec_held_blocks()
+      {
+        --m_held_blocks;
+        if (m_held_blocks == 0)
+          stop_holding_blocks();
+      }
+      // Hooks for subclasses: first block becomes held / last held block is gone.
+      virtual void start_holding_blocks()
+      { }
+
+      virtual void stop_holding_blocks()
+      { }
+
+    public:
+      pointer_type allocate(size_type size)
+      {
+        bin_nr_t bin_nr = bin_number(size);
+        bin_t &bin = get_bin(bin_nr);
+
+        if (bin.size())
+        {
+          if (m_trace)
+            std::cout
+              << "[pool] allocation of size " << size << " served from bin " << bin_nr
+              << " which contained " << bin.size() << " entries" << std::endl;
+          return pop_block_from_bin(bin, size);
+        }
+
+        size_type alloc_sz = alloc_size(bin_nr);
+
+        assert(bin_number(alloc_sz) == bin_nr);
+
+        if (m_trace)
+          std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl;
+
+        try { return get_from_allocator(alloc_sz); }
+        catch (PYGPU_PACKAGE::error &e)
+        {
+          if (!e.is_out_of_memory())
+            throw;
+        }
+
+        if (m_trace)
+          std::cout << "[pool] allocation triggered OOM, running GC" << std::endl;
+
+        m_allocator->try_release_blocks();
+        if (bin.size())
+          return pop_block_from_bin(bin, size);
+
+        if (m_trace)
+          std::cout << "[pool] allocation still OOM after GC" << std::endl;
+
+        while (try_to_free_memory())
+        {
+          try { return get_from_allocator(alloc_sz); }
+          catch (PYGPU_PACKAGE::error &e)
+          {
+            if (!e.is_out_of_memory())
+              throw;
+          }
+        }
+
+        throw PYGPU_PACKAGE::error(
+            "memory_pool::allocate",
+#ifdef PYGPU_PYCUDA
+            CUDA_ERROR_OUT_OF_MEMORY,
+#endif
+#ifdef PYGPU_PYOPENCL
+            CL_MEM_OBJECT_ALLOCATION_FAILURE,
+#endif
+            "failed to free memory for allocation");
+      }
+      // Takes block p back; size selects the bin, so it should match the request.
+      void free(pointer_type p, size_type size)
+      {
+        --m_active_blocks;
+        bin_nr_t bin_nr = bin_number(size);
+
+        if (!m_stop_holding)
+        {
+          inc_held_blocks();
+          get_bin(bin_nr).push_back(p);
+
+          if (m_trace)
+            std::cout << "[pool] block of size " << size << " returned to bin "
+              << bin_nr << " which now contains " << get_bin(bin_nr).size()
+              << " entries" << std::endl;
+        }
+        else
+          m_allocator->free(p);
+      }
+      // Releases all held blocks to the allocator.
+      void free_held()
+      {
+        for (bin_pair_t &bin_pair: m_container)
+        {
+          bin_t &bin = bin_pair.second;
+
+          while (bin.size())
+          {
+            m_allocator->free(bin.back());
+            bin.pop_back();
+
+            dec_held_blocks();
+          }
+        }
+
+        assert(m_held_blocks == 0);
+      }
+      // Frees the held blocks and stops keeping freed blocks from now on.
+      void stop_holding()
+      {
+        m_stop_holding = true;
+        free_held();
+      }
+
+      unsigned active_blocks()
+      { return m_active_blocks; }
+
+      unsigned held_blocks()
+      { return m_held_blocks; }
+      // Releases one held block from the highest non-empty bin; false if none.
+      bool try_to_free_memory()
+      {
+        // free largest stuff first
+        for (bin_pair_t &bin_pair: reverse(m_container))
+        {
+          bin_t &bin = bin_pair.second;
+
+          if (bin.size())
+          {
+            m_allocator->free(bin.back());
+            bin.pop_back();
+
+            dec_held_blocks();
+
+            return true;
+          }
+        }
+
+        return false;
+      }
+
+    private:
+      pointer_type get_from_allocator(size_type alloc_sz)
+      {
+        pointer_type result = m_allocator->allocate(alloc_sz);
+        ++m_active_blocks;
+
+        return result;
+      }
+
+      pointer_type pop_block_from_bin(bin_t &bin, size_type size)
+      {
+        pointer_type result = bin.back();
+        bin.pop_back();
+
+        dec_held_blocks();
+        ++m_active_blocks;
+
+        return result;
+      }
+  };
+
+
+  template <class Pool>
+  class pooled_allocation : public noncopyable
+  {
+    public:
+      typedef Pool pool_type;
+      typedef typename Pool::pointer_type pointer_type;
+      typedef typename Pool::size_type size_type;
+
+    private:
+      std::shared_ptr<pool_type> m_pool;
+      pointer_type m_ptr;
+      size_type m_size;
+      bool m_valid;    // false once free() has handed the block back
+
+    public:
+      // Takes a block of the requested size out of pool p.
+      pooled_allocation(std::shared_ptr<pool_type> p, size_type size)
+        : m_pool(p), m_ptr(p->allocate(size)), m_size(size), m_valid(true)
+      { }
+
+      // Gives the block back to the pool unless free() already did.
+      ~pooled_allocation()
+      {
+        if (!m_valid)
+          return;
+        free();
+      }
+
+      // Gives the block back to the pool; calling it a second time throws.
+      void free()
+      {
+        if (!m_valid)
+          throw PYGPU_PACKAGE::error(
+              "pooled_device_allocation::free",
+#ifdef PYGPU_PYCUDA
+              CUDA_ERROR_INVALID_HANDLE
+#endif
+#ifdef PYGPU_PYOPENCL
+              CL_INVALID_VALUE
+#endif
+              );
+        m_pool->free(m_ptr, m_size);
+        m_valid = false;
+      }
+
+      pointer_type ptr() const
+      { return m_ptr; }
+
+      size_type size() const
+      { return m_size; }
+  };
+}
+
+
+
+
+#endif
diff --git a/src/numpy_init.hpp b/src/numpy_init.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d6393f480471d5829b226bef5f0d6d61c6643d8
--- /dev/null
+++ b/src/numpy_init.hpp
@@ -0,0 +1,35 @@
+#ifndef _FAYHVVAAA_PYOPENCL_HEADER_SEEN_NUMPY_INIT_HPP
+#define _FAYHVVAAA_PYOPENCL_HEADER_SEEN_NUMPY_INIT_HPP
+
+
+// #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/arrayobject.h>
+#include <stdexcept>
+// Runs numpy's import_array step while the static _array_importer object of
+// each including translation unit is constructed; throws if it reports failure.
+namespace
+{
+  static struct pyublas_array_importer
+  {
+    static bool do_import_array()
+    {
+#ifdef PYPY_VERSION
+      import_array();
+#else
+      import_array1(false);
+#endif
+      return true;
+    }
+
+    pyublas_array_importer()
+    {
+      if (!do_import_array())
+        throw std::runtime_error("numpy failed to initialize");
+    }
+  } _array_importer;
+}
+
+
+
+
+#endif
diff --git a/src/c_wrapper/pyopencl_ext.h b/src/pyopencl_ext.h
similarity index 100%
rename from src/c_wrapper/pyopencl_ext.h
rename to src/pyopencl_ext.h
diff --git a/src/tools.hpp b/src/tools.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..935dab7ac03daf52a0610e6a202b65503856da1c
--- /dev/null
+++ b/src/tools.hpp
@@ -0,0 +1,65 @@
+#ifndef _ASDFDAFVVAFF_PYCUDA_HEADER_SEEN_TOOLS_HPP
+#define _ASDFDAFVVAFF_PYCUDA_HEADER_SEEN_TOOLS_HPP
+
+
+#include <pybind11/pybind11.h>
+
+#include <numeric>
+#include "numpy_init.hpp"
+
+
+
+
+namespace pyopencl
+{
+  // Product of the first ndim entries of dims (1 for a zero-dimensional array).
+  // The accumulate seed is npy_intp rather than int, so the running product
+  // is not truncated to int for large arrays.
+  inline npy_intp size_from_dims(int ndim, const npy_intp *dims)
+  {
+    return std::accumulate(
+        dims, dims+ndim, npy_intp(1), std::multiplies<npy_intp>());
+  }
+
+
+
+
+  // Calls collect() on Python's gc module.
+  inline void run_python_gc()
+  {
+    pybind11::module gc_module = pybind11::module::import("gc");
+    gc_module.attr("collect")();
+  }
+
+
+  // Reverse-iteration adapter for range-for, see https://stackoverflow.com/a/28139075
+  template <typename Range>
+  struct reversion_wrapper { Range& iterable; };
+  template <typename Range>
+  auto begin (reversion_wrapper<Range> wrapped)
+    -> decltype(wrapped.iterable.rbegin()) { return wrapped.iterable.rbegin(); }
+  template <typename Range>
+  auto end (reversion_wrapper<Range> wrapped)
+    -> decltype(wrapped.iterable.rend()) { return wrapped.iterable.rend(); }
+  template <typename Range>
+  reversion_wrapper<Range> reverse (Range&& iterable)
+  { return reversion_wrapper<Range>{ iterable }; }
+
+
+  // Base class that switches off copy construction and copy assignment for
+  // itself and, through inheritance, for every class deriving from it.
+  // See https://stackoverflow.com/a/44175911
+  class noncopyable {
+  public:
+    noncopyable() = default;
+    ~noncopyable() = default;
+    noncopyable(const noncopyable&) = delete;
+    noncopyable& operator=(const noncopyable&) = delete;
+  };
+}
+
+
+
+
+
+#endif
diff --git a/src/wrap_cl.cpp b/src/wrap_cl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..29b546e7a07441323cdf6614907b2fe5995f5915
--- /dev/null
+++ b/src/wrap_cl.cpp
@@ -0,0 +1,24 @@
+#include "wrap_cl.hpp"
+
+
+
+
+using namespace pyopencl;
+
+
+
+
+extern void pyopencl_expose_constants(py::module &m);
+extern void pyopencl_expose_part_1(py::module &m);
+extern void pyopencl_expose_part_2(py::module &m);
+extern void pyopencl_expose_mempool(py::module &m);
+// Entry point of the _cl extension module: runs each expose function on m.
+PYBIND11_MODULE(_cl, m)
+{
+  pyopencl_expose_constants(m);
+  pyopencl_expose_part_1(m);
+  pyopencl_expose_part_2(m);
+  pyopencl_expose_mempool(m);
+}
+
+// vim: foldmethod=marker
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ace4bd2556a37541dcb5cfdafdc5c9df963bd31e
--- /dev/null
+++ b/src/wrap_cl.hpp
@@ -0,0 +1,4898 @@
+#ifndef _AFJHAYYTA_PYOPENCL_HEADER_SEEN_WRAP_CL_HPP
+#define _AFJHAYYTA_PYOPENCL_HEADER_SEEN_WRAP_CL_HPP
+
+// CL 1.2 undecided:
+// clSetPrintfCallback
+
+// {{{ includes
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+// #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+
+#ifdef __APPLE__
+
+// Mac ------------------------------------------------------------------------
+#include <OpenCL/opencl.h>
+#include "pyopencl_ext.h"
+#ifdef HAVE_GL
+
+#define PYOPENCL_GL_SHARING_VERSION 1
+
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#endif
+
+#else
+
+// elsewhere ------------------------------------------------------------------
+#define CL_TARGET_OPENCL_VERSION 220
+
+#include <CL/cl.h>
+#include "pyopencl_ext.h"
+
+#if defined(_WIN32)
+#define NOMINMAX
+#include <windows.h>
+#endif
+
+#ifdef HAVE_GL
+#include <GL/gl.h>
+#include <CL/cl_gl.h>
+#endif
+
+#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
+#define PYOPENCL_GL_SHARING_VERSION cl_khr_gl_sharing
+#endif
+
+#endif
+
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+
+#include <cstdio>
+#include <stdexcept>
+#include <iostream>
+#include <vector>
+#include <utility>
+#include <numeric>
+#include "wrap_helpers.hpp"
+#include "numpy_init.hpp"
+#include "tools.hpp"
+
+#ifdef PYOPENCL_PRETEND_CL_VERSION
+#define PYOPENCL_CL_VERSION PYOPENCL_PRETEND_CL_VERSION
+#else
+
+#if defined(CL_VERSION_2_2)
+#define PYOPENCL_CL_VERSION 0x2020
+#elif defined(CL_VERSION_2_1)
+#define PYOPENCL_CL_VERSION 0x2010
+#elif defined(CL_VERSION_2_0)
+#define PYOPENCL_CL_VERSION 0x2000
+#elif defined(CL_VERSION_1_2)
+#define PYOPENCL_CL_VERSION 0x1020
+#elif defined(CL_VERSION_1_1)
+#define PYOPENCL_CL_VERSION 0x1010
+#else
+#define PYOPENCL_CL_VERSION 0x1000
+#endif
+
+#endif
+
+// Python 3 and PyPy builds select the Py_buffer-based buffer interface.
+#if (PY_VERSION_HEX >= 0x03000000) || defined(PYPY_VERSION)
+#define PYOPENCL_USE_NEW_BUFFER_INTERFACE
+#define PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(s) std::move(s)
+#else
+#define PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(s) (s)
+#endif
+
+
+
+// }}}
+
+
+
+
+// {{{ tools
+// Integer type for buffer sizes: Py_ssize_t from Python 2.5 on, int before.
+#if PY_VERSION_HEX >= 0x02050000
+  typedef Py_ssize_t PYOPENCL_BUFFER_SIZE_T;
+#else
+  typedef int PYOPENCL_BUFFER_SIZE_T;
+#endif
+
+// Maps a truth value to CL_TRUE / CL_FALSE.
+#define PYOPENCL_CAST_BOOL(B) ((B) ? CL_TRUE : CL_FALSE)
+
+
+// Issues a Python DeprecationWarning for WHAT, naming the PyOpenCL version
+// KILL_VERSION in which it stops working, followed by EXTRA_MSG.
+#define PYOPENCL_DEPRECATED(WHAT, KILL_VERSION, EXTRA_MSG) \
+  { \
+    PyErr_Warn( \
+        PyExc_DeprecationWarning, \
+        WHAT " is deprecated and will stop working in PyOpenCL " KILL_VERSION". " \
+        EXTRA_MSG); \
+  }
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+// Declares VAR as extension function NAME looked up for PLATFORM; throws if null.
+#define PYOPENCL_GET_EXT_FUN(PLATFORM, NAME, VAR) \
+    NAME##_fn VAR \
+      = (NAME##_fn) \
+      clGetExtensionFunctionAddressForPlatform(PLATFORM, #NAME); \
+    \
+    if (!VAR) \
+      throw error(#NAME, CL_INVALID_VALUE, #NAME \
+          " not available");
+
+#else
+// Pre-1.2 headers: same, via clGetExtensionFunctionAddress (PLATFORM is unused).
+#define PYOPENCL_GET_EXT_FUN(PLATFORM, NAME, VAR) \
+    NAME##_fn VAR \
+      = (NAME##_fn) \
+      clGetExtensionFunctionAddress(#NAME); \
+    \
+    if (!VAR) \
+      throw error(#NAME, CL_INVALID_VALUE, #NAME \
+          " not available");
+
+#endif
+// Expects py_devices in scope. Declares num_devices and devices: 0 and a null
+// pointer for None, otherwise the cl_device_ids of the iterated device objects.
+#define PYOPENCL_PARSE_PY_DEVICES \
+    std::vector<cl_device_id> devices_vec; \
+    cl_uint num_devices; \
+    cl_device_id *devices; \
+    \
+    if (py_devices.ptr() == Py_None) \
+    { \
+      num_devices = 0; \
+      devices = 0; \
+    } \
+    else \
+    { \
+      for (py::handle py_dev: py_devices) \
+        devices_vec.push_back( \
+            (py_dev).cast<device &>().data()); \
+      num_devices = devices_vec.size(); \
+      devices = devices_vec.empty( ) ? nullptr : &devices_vec.front(); \
+    } \
+
+// Runs OPERATION; on an out-of-memory error, runs the Python GC and retries.
+#define PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(OPERATION) \
+    try \
+    { \
+      OPERATION \
+    } \
+    catch (pyopencl::error &e) \
+    { \
+      if (!e.is_out_of_memory()) \
+        throw; \
+    } \
+    \
+    /* If we get here, we got an error from CL.
+     * We should run the Python GC to try and free up
+     * some memory references. */ \
+    run_python_gc(); \
+    \
+    /* Now retry the allocation. If it fails again,
+     * let it fail. */ \
+    { \
+      OPERATION \
+    }
+
+
+// Statement-block form of the retry: OPERATION runs again, after a Python GC
+// pass, only if its first run threw an out-of-memory pyopencl::error.
+#define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \
+  { \
+    bool failed_with_mem_error = false; \
+    try \
+    { \
+      OPERATION \
+    } \
+    catch (pyopencl::error &e) \
+    { \
+      failed_with_mem_error = true; \
+      if (!e.is_out_of_memory()) \
+        throw; \
+    } \
+    \
+    if (failed_with_mem_error) \
+    { \
+      /* If we get here, we got an error from CL.
+       * We should run the Python GC to try and free up
+       * some memory references. */ \
+      run_python_gc(); \
+      \
+      /* Now retry the allocation. If it fails again,
+       * let it fail. */ \
+      { \
+        OPERATION \
+      } \
+    } \
+  }
+
+// }}}
+// {{{ tracing and error reporting
+// When PYOPENCL_TRACE is defined these print the call name to std::cerr.
+#ifdef PYOPENCL_TRACE
+  #define PYOPENCL_PRINT_CALL_TRACE(NAME) \
+    std::cerr << NAME << std::endl;
+  #define PYOPENCL_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) \
+    std::cerr << NAME << " (" << EXTRA_INFO << ')' << std::endl;
+#else
+  #define PYOPENCL_PRINT_CALL_TRACE(NAME) /*nothing*/
+  #define PYOPENCL_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) /*nothing*/
+#endif
+// Calls NAME ARGLIST with the GIL released; a non-CL_SUCCESS status throws.
+#define PYOPENCL_CALL_GUARDED_THREADED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \
+  { \
+    PYOPENCL_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \
+    cl_int status_code; \
+    { \
+      py::gil_scoped_release release; \
+      status_code = NAME ARGLIST; \
+    } \
+    if (status_code != CL_SUCCESS) \
+      throw pyopencl::error(#NAME, status_code);\
+  }
+// Same call-and-check with trace info, but the GIL stays held.
+#define PYOPENCL_CALL_GUARDED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \
+  { \
+    PYOPENCL_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \
+    cl_int status_code; \
+    status_code = NAME ARGLIST; \
+    if (status_code != CL_SUCCESS) \
+      throw pyopencl::error(#NAME, status_code);\
+  }
+// Calls NAME ARGLIST with the GIL released; a non-CL_SUCCESS status throws.
+#define PYOPENCL_CALL_GUARDED_THREADED(NAME, ARGLIST) \
+  { \
+    PYOPENCL_PRINT_CALL_TRACE(#NAME); \
+    cl_int status_code; \
+    { \
+      py::gil_scoped_release release; \
+      status_code = NAME ARGLIST; \
+    } \
+    if (status_code != CL_SUCCESS) \
+      throw pyopencl::error(#NAME, status_code);\
+  }
+// GIL-holding variants: CALL_GUARDED throws, CALL_GUARDED_CLEANUP only warns.
+#define PYOPENCL_CALL_GUARDED(NAME, ARGLIST) \
+  { \
+    PYOPENCL_PRINT_CALL_TRACE(#NAME); \
+    cl_int status_code; \
+    status_code = NAME ARGLIST; \
+    if (status_code != CL_SUCCESS) \
+      throw pyopencl::error(#NAME, status_code);\
+  }
+#define PYOPENCL_CALL_GUARDED_CLEANUP(NAME, ARGLIST) \
+  { \
+    PYOPENCL_PRINT_CALL_TRACE(#NAME); \
+    cl_int status_code; \
+    status_code = NAME ARGLIST; \
+    if (status_code != CL_SUCCESS) \
+      std::cerr \
+        << "PyOpenCL WARNING: a clean-up operation failed (dead context maybe?)" \
+        << std::endl \
+        << #NAME " failed with code " << status_code \
+        << std::endl; \
+  }
+
+// }}}
+// {{{ get_info helpers
+// Returns a new TYPE wrapping the queried CL_TYPE handle, or None if it is 0.
+#define PYOPENCL_GET_OPAQUE_INFO(WHAT, FIRST_ARG, SECOND_ARG, CL_TYPE, TYPE) \
+  { \
+    CL_TYPE param_value; \
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
+          (FIRST_ARG, SECOND_ARG, sizeof(param_value), &param_value, 0)); \
+    if (param_value) \
+      return py::object(handle_from_new_ptr( \
+            new TYPE(param_value, /*retain*/ true))); \
+    else \
+      return py::none(); \
+  }
+// Resizes RES_VEC to the reported byte size and fills it with the property.
+#define PYOPENCL_GET_VEC_INFO(WHAT, FIRST_ARG, SECOND_ARG, RES_VEC) \
+  { \
+    size_t size; \
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
+        (FIRST_ARG, SECOND_ARG, 0, 0, &size)); \
+    \
+    RES_VEC.resize(size / sizeof(RES_VEC.front())); \
+    \
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
+        (FIRST_ARG, SECOND_ARG, size, \
+         RES_VEC.empty( ) ? nullptr : &RES_VEC.front(), &size)); \
+  }
+// Returns the property as a Python string minus its last byte (presumably NUL).
+#define PYOPENCL_GET_STR_INFO(WHAT, FIRST_ARG, SECOND_ARG) \
+  { \
+    size_t param_value_size; \
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
+        (FIRST_ARG, SECOND_ARG, 0, 0, &param_value_size)); \
+    \
+    std::vector<char> param_value(param_value_size); \
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
+        (FIRST_ARG, SECOND_ARG, param_value_size,  \
+         param_value.empty( ) ? nullptr : &param_value.front(), &param_value_size)); \
+    \
+    return py::cast( \
+        param_value.empty( ) ? "" : std::string(&param_value.front(), param_value_size-1)); \
+  }
+
+
+
+// Returns the property, read into a TYPE, converted with py::cast.
+#define PYOPENCL_GET_INTEGRAL_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE) \
+  { \
+    TYPE param_value; \
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
+        (FIRST_ARG, SECOND_ARG, sizeof(param_value), &param_value, 0)); \
+    return py::cast(param_value); \
+  }
+
+// }}}
+// {{{ event helpers --------------------------------------------------------------
+// Expects py_wait_for in scope; collects its events' cl_event handles.
+#define PYOPENCL_PARSE_WAIT_FOR \
+    cl_uint num_events_in_wait_list = 0; \
+    std::vector<cl_event> event_wait_list; \
+    \
+    if (py_wait_for.ptr() != Py_None) \
+    { \
+      event_wait_list.resize(len(py_wait_for)); \
+      for (py::handle evt: py_wait_for) \
+        event_wait_list[num_events_in_wait_list++] = \
+          evt.cast<const event &>().data(); \
+    }
+// Count and pointer (nullptr if empty) built by PYOPENCL_PARSE_WAIT_FOR.
+#define PYOPENCL_WAITLIST_ARGS \
+    num_events_in_wait_list, event_wait_list.empty( ) ? nullptr : &event_wait_list.front()
+// Returns a new nanny_event for evt; releases evt if construction throws.
+#define PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, obj) \
+    try \
+    { \
+      return new nanny_event(evt, false, obj); \
+    } \
+    catch (...) \
+    { \
+      clReleaseEvent(evt); \
+      throw; \
+    }
+// Returns a new event for evt; releases evt if construction throws.
+#define PYOPENCL_RETURN_NEW_EVENT(evt) \
+    try \
+    { \
+      return new event(evt, false); \
+    } \
+    catch (...) \
+    { \
+      clReleaseEvent(evt); \
+      throw; \
+    }
+
+// }}}
+// {{{ equality testing
+// Adds ==, != and hash() to cls, all based on the handle returned by data().
+#define PYOPENCL_EQUALITY_TESTS(cls) \
+    bool operator==(cls const &other) const \
+    { return data() == other.data(); } \
+    bool operator!=(cls const &other) const \
+    { return data() != other.data(); } \
+    long hash() const \
+    { return (long) (intptr_t) data(); }
+// }}}
+
+
+
+namespace pyopencl
+{
+  // {{{ error
+  // Exception type for failed CL calls: records the routine name and status code.
+  class error : public std::runtime_error
+  {
+    private:
+      std::string m_routine;
+      cl_int m_code;
+
+    public:
+      error(const char *routine, cl_int c, const char *msg="")
+        : std::runtime_error(msg), m_routine(routine), m_code(c)
+      { }
+
+      const std::string &routine() const { return m_routine; }
+      cl_int code() const { return m_code; }
+
+      // True for the three status codes treated as memory exhaustion.
+      bool is_out_of_memory() const
+      {
+        switch (m_code)
+        {
+          case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+          case CL_OUT_OF_RESOURCES:
+          case CL_OUT_OF_HOST_MEMORY:
+            return true;
+          default:
+            return false;
+        }
+      }
+  };
+
+  // }}}
+
+
+  // {{{ buffer interface helper
+  // Owns a Py_buffer view: get() acquires it, the destructor releases it.
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+  class py_buffer_wrapper : public noncopyable
+  {
+    private:
+      bool m_initialized;
+
+    public:
+      Py_buffer m_buf;
+
+    py_buffer_wrapper()
+      : m_initialized(false)
+    {}
+
+    void get(PyObject *obj, int flags)
+    {
+#ifdef PYPY_VERSION
+      // work around https://bitbucket.org/pypy/pypy/issues/2873
+      if (flags & PyBUF_ANY_CONTIGUOUS)
+      {
+        int flags_wo_cont = flags & ~PyBUF_ANY_CONTIGUOUS;
+        if (PyObject_GetBuffer(obj, &m_buf, flags_wo_cont | PyBUF_C_CONTIGUOUS))
+        {
+          PyErr_Clear();
+          if (PyObject_GetBuffer(obj, &m_buf, flags_wo_cont | PyBUF_F_CONTIGUOUS))
+            throw py::error_already_set();
+        }
+      }
+      else
+#endif
+      if (PyObject_GetBuffer(obj, &m_buf, flags))
+        throw py::error_already_set();
+      // Reached only if PyObject_GetBuffer succeeded (failures throw above).
+      m_initialized = true;
+    }
+
+    virtual ~py_buffer_wrapper()
+    {
+      if (m_initialized)
+        PyBuffer_Release(&m_buf);
+    }
+  };
+#endif
+
+  // }}}
+
+  // Returns (major, minor) of the CL headers, decoded from PYOPENCL_CL_VERSION.
+  inline py::tuple get_cl_header_version()
+  {
+    const int header_major = PYOPENCL_CL_VERSION >> (3*4);
+    const int header_minor = (PYOPENCL_CL_VERSION >> (1*4)) & 0xff;
+
+    return py::make_tuple(header_major, header_minor);
+  }
+
+
+  // {{{ platform
+  // Wrapper around a cl_platform_id; no retain/release is performed on it.
+  class platform : noncopyable
+  {
+    private:
+      cl_platform_id m_platform;
+
+    public:
+      platform(cl_platform_id pid)
+      : m_platform(pid)
+      { }
+
+      platform(cl_platform_id pid, bool /*retain (ignored)*/)
+      : m_platform(pid)
+      { }
+
+      cl_platform_id data() const
+      {
+        return m_platform;
+      }
+
+      PYOPENCL_EQUALITY_TESTS(platform);
+      // Returns the string property param_name; unlisted values throw error.
+      py::object get_info(cl_platform_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_PLATFORM_PROFILE:
+          case CL_PLATFORM_VERSION:
+          case CL_PLATFORM_NAME:
+          case CL_PLATFORM_VENDOR:
+#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
+          case CL_PLATFORM_EXTENSIONS:
+#endif
+            PYOPENCL_GET_STR_INFO(Platform, m_platform, param_name);
+
+          default:
+            throw error("Platform.get_info", CL_INVALID_VALUE);
+        }
+      }
+
+      py::list get_devices(cl_device_type devtype);
+  };
+
+
+
+
+  // Builds a Python list holding one platform object per available CL platform.
+  inline py::list get_platforms()
+  {
+    // The first call only reports how many platforms there are.
+    cl_uint platform_count = 0;
+    PYOPENCL_CALL_GUARDED(clGetPlatformIDs, (0, 0, &platform_count));
+
+    std::vector<cl_platform_id> ids(platform_count);
+    cl_platform_id *id_storage = ids.empty() ? nullptr : ids.data();
+    PYOPENCL_CALL_GUARDED(clGetPlatformIDs,
+        (platform_count, id_storage, &platform_count));
+
+    py::list result;
+    for (size_t i = 0; i < ids.size(); ++i)
+      result.append(handle_from_new_ptr(new platform(ids[i])));
+    return result;
+  }
+
+  // }}}
+
+
+  // {{{ device
+
+  class device : noncopyable
+  {
+    public:
+      enum reference_type_t {
+        REF_NOT_OWNABLE,
+#if PYOPENCL_CL_VERSION >= 0x1020
+        REF_CL_1_2,
+#endif
+      };
+    private:
+      cl_device_id m_device;
+      reference_type_t m_ref_type;
+
+    public:
+      device(cl_device_id did)
+      : m_device(did), m_ref_type(REF_NOT_OWNABLE)
+      { }
+
+      device(cl_device_id did, bool retain, reference_type_t ref_type=REF_NOT_OWNABLE)
+      : m_device(did), m_ref_type(ref_type)
+      {
+        if (retain && ref_type != REF_NOT_OWNABLE)
+        {
+          if (false)
+          { }
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+          else if (ref_type == REF_CL_1_2)
+          {
+            PYOPENCL_CALL_GUARDED(clRetainDevice, (did));
+          }
+#endif
+
+          else
+            throw error("Device", CL_INVALID_VALUE,
+                "cannot own references to devices when device fission or CL 1.2 is not available");
+        }
+      }
+
+      ~device()
+      {
+#if PYOPENCL_CL_VERSION >= 0x1020
+        if (m_ref_type == REF_CL_1_2)
+          PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseDevice, (m_device));
+#endif
+      }
+
+      cl_device_id data() const
+      {
+        return m_device;
+      }
+
+      PYOPENCL_EQUALITY_TESTS(device);
+
+      py::object get_info(cl_device_info param_name) const
+      {
+#define DEV_GET_INT_INF(TYPE) \
+        PYOPENCL_GET_INTEGRAL_INFO(Device, m_device, param_name, TYPE);
+
+        switch (param_name)
+        {
+          case CL_DEVICE_TYPE: DEV_GET_INT_INF(cl_device_type);
+          case CL_DEVICE_VENDOR_ID: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_COMPUTE_UNITS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_WORK_GROUP_SIZE: DEV_GET_INT_INF(size_t);
+
+          case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+            {
+              std::vector<size_t> result;
+              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+              PYOPENCL_RETURN_VECTOR(size_t, result);
+            }
+
+          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: DEV_GET_INT_INF(cl_uint);
+
+          case CL_DEVICE_MAX_CLOCK_FREQUENCY: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_ADDRESS_BITS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_READ_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_WRITE_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_MEM_ALLOC_SIZE: DEV_GET_INT_INF(cl_ulong);
+          case CL_DEVICE_IMAGE2D_MAX_WIDTH: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_IMAGE2D_MAX_HEIGHT: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_IMAGE3D_MAX_WIDTH: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_IMAGE3D_MAX_HEIGHT: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_IMAGE3D_MAX_DEPTH: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_IMAGE_SUPPORT: DEV_GET_INT_INF(cl_bool);
+          case CL_DEVICE_MAX_PARAMETER_SIZE: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_MAX_SAMPLERS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MEM_BASE_ADDR_ALIGN: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_SINGLE_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
+#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
+          case CL_DEVICE_DOUBLE_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
+#endif
+#ifdef CL_DEVICE_HALF_FP_CONFIG
+          case CL_DEVICE_HALF_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
+#endif
+
+          case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: DEV_GET_INT_INF(cl_device_mem_cache_type);
+          case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: DEV_GET_INT_INF(cl_ulong);
+          case CL_DEVICE_GLOBAL_MEM_SIZE: DEV_GET_INT_INF(cl_ulong);
+
+          case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: DEV_GET_INT_INF(cl_ulong);
+          case CL_DEVICE_MAX_CONSTANT_ARGS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_LOCAL_MEM_TYPE: DEV_GET_INT_INF(cl_device_local_mem_type);
+          case CL_DEVICE_LOCAL_MEM_SIZE: DEV_GET_INT_INF(cl_ulong);
+          case CL_DEVICE_ERROR_CORRECTION_SUPPORT: DEV_GET_INT_INF(cl_bool);
+          case CL_DEVICE_PROFILING_TIMER_RESOLUTION: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_ENDIAN_LITTLE: DEV_GET_INT_INF(cl_bool);
+          case CL_DEVICE_AVAILABLE: DEV_GET_INT_INF(cl_bool);
+          case CL_DEVICE_COMPILER_AVAILABLE: DEV_GET_INT_INF(cl_bool);
+          case CL_DEVICE_EXECUTION_CAPABILITIES: DEV_GET_INT_INF(cl_device_exec_capabilities);
+#if PYOPENCL_CL_VERSION >= 0x2000
+          case CL_DEVICE_QUEUE_ON_HOST_PROPERTIES: DEV_GET_INT_INF(cl_command_queue_properties);
+#else
+          case CL_DEVICE_QUEUE_PROPERTIES: DEV_GET_INT_INF(cl_command_queue_properties);
+#endif
+
+          case CL_DEVICE_NAME:
+          case CL_DEVICE_VENDOR:
+          case CL_DRIVER_VERSION:
+          case CL_DEVICE_PROFILE:
+          case CL_DEVICE_VERSION:
+          case CL_DEVICE_EXTENSIONS:
+            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+
+          case CL_DEVICE_PLATFORM:
+            PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_platform_id, platform);
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: DEV_GET_INT_INF(cl_uint);
+
+          case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF: DEV_GET_INT_INF(cl_uint);
+
+          case CL_DEVICE_HOST_UNIFIED_MEMORY: DEV_GET_INT_INF(cl_bool);
+          case CL_DEVICE_OPENCL_C_VERSION:
+            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+#endif
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+          case CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV:
+          case CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV:
+          case CL_DEVICE_REGISTERS_PER_BLOCK_NV:
+          case CL_DEVICE_WARP_SIZE_NV:
+            DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_GPU_OVERLAP_NV:
+          case CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:
+          case CL_DEVICE_INTEGRATED_MEMORY_NV:
+            DEV_GET_INT_INF(cl_bool);
+#endif
+#ifdef CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV
+          case CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV:
+            DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_PCI_BUS_ID_NV
+          case CL_DEVICE_PCI_BUS_ID_NV:
+            DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_PCI_SLOT_ID_NV
+          case CL_DEVICE_PCI_SLOT_ID_NV:
+            DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD
+          case CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD: DEV_GET_INT_INF(cl_bool);
+#endif
+#ifdef CL_DEVICE_GFXIP_MAJOR_AMD
+          case CL_DEVICE_GFXIP_MAJOR_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_GFXIP_MINOR_AMD
+          case CL_DEVICE_GFXIP_MINOR_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD
+          case CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x1020
+          case CL_DEVICE_LINKER_AVAILABLE: DEV_GET_INT_INF(cl_bool);
+          case CL_DEVICE_BUILT_IN_KERNELS:
+            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+          case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_PARENT_DEVICE:
+            PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_device_id, device);
+          case CL_DEVICE_PARTITION_MAX_SUB_DEVICES: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PARTITION_TYPE:
+          case CL_DEVICE_PARTITION_PROPERTIES:
+            {
+              std::vector<cl_device_partition_property> result;
+              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+              PYOPENCL_RETURN_VECTOR(cl_device_partition_property, result);
+            }
+          case CL_DEVICE_PARTITION_AFFINITY_DOMAIN:
+            {
+#if defined(__GNUG__) && !defined(__clang__)
+#pragma GCC diagnostic push
+// what's being ignored here is an alignment attribute to native size, which
+// shouldn't matter on the relevant ABIs that I'm aware of.
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+              std::vector<cl_device_affinity_domain> result;
+#if defined(__GNUG__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+              PYOPENCL_RETURN_VECTOR(cl_device_affinity_domain, result);
+            }
+          case CL_DEVICE_REFERENCE_COUNT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC: DEV_GET_INT_INF(cl_bool);
+          case CL_DEVICE_PRINTF_BUFFER_SIZE: DEV_GET_INT_INF(cl_bool);
+#endif
+// {{{ AMD dev attrs cl_amd_device_attribute_query
+//
+// types of AMD dev attrs divined from
+// https://www.khronos.org/registry/cl/api/1.2/cl.hpp
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+          case CL_DEVICE_PROFILING_TIMER_OFFSET_AMD: DEV_GET_INT_INF(cl_ulong);
+#endif
+/* FIXME
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+          case CL_DEVICE_TOPOLOGY_AMD:
+#endif
+*/
+#ifdef CL_DEVICE_BOARD_NAME_AMD
+          case CL_DEVICE_BOARD_NAME_AMD: ;
+            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+#endif
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+          case CL_DEVICE_GLOBAL_FREE_MEMORY_AMD:
+            {
+              std::vector<size_t> result;
+              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+              PYOPENCL_RETURN_VECTOR(size_t, result);
+            }
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+          case CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+          case CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+          case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+          case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+          case CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+          case CL_DEVICE_LOCAL_MEM_BANKS_AMD: DEV_GET_INT_INF(cl_uint);
+#endif
+// }}}
+
+#ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT
+          case CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT: DEV_GET_INT_INF(cl_uint);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+          case CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES: DEV_GET_INT_INF(cl_command_queue_properties);
+          case CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_ON_DEVICE_QUEUES: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_MAX_ON_DEVICE_EVENTS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_SVM_CAPABILITIES: DEV_GET_INT_INF(cl_device_svm_capabilities);
+          case CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE: DEV_GET_INT_INF(size_t);
+          case CL_DEVICE_MAX_PIPE_ARGS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PIPE_MAX_PACKET_SIZE: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT: DEV_GET_INT_INF(cl_uint);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2010
+          case CL_DEVICE_IL_VERSION:
+            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+          case CL_DEVICE_MAX_NUM_SUB_GROUPS: DEV_GET_INT_INF(cl_uint);
+          case CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: DEV_GET_INT_INF(cl_bool);
+#endif
+#ifdef CL_DEVICE_ME_VERSION_INTEL
+          case CL_DEVICE_ME_VERSION_INTEL: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM
+          case CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_PAGE_SIZE_QCOM
+          case CL_DEVICE_PAGE_SIZE_QCOM: DEV_GET_INT_INF(cl_uint);
+#endif
+#ifdef CL_DEVICE_SPIR_VERSIONS
+          case CL_DEVICE_SPIR_VERSIONS:
+            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+#endif
+#ifdef CL_DEVICE_CORE_TEMPERATURE_ALTERA
+          case CL_DEVICE_CORE_TEMPERATURE_ALTERA: DEV_GET_INT_INF(cl_int);
+#endif
+
+#ifdef CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL
+          case CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL:
+            {
+              std::vector<cl_uint> result;
+              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+              PYOPENCL_RETURN_VECTOR(cl_uint, result);
+            }
+#endif
+#ifdef CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL
+          case CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL: DEV_GET_INT_INF(cl_uint);
+#endif
+
+          default:
+            throw error("Device.get_info", CL_INVALID_VALUE);
+        }
+      }
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+      // Partition this device into sub-devices using clCreateSubDevices.
+      //
+      // py_properties: Python iterable of cl_device_partition_property
+      //   values. The terminating 0 is appended here, so callers must not
+      //   supply it themselves.
+      // Returns: Python list of newly wrapped pyopencl::device objects
+      //   (empty if the implementation reports that zero sub-devices
+      //   would be created).
+      py::list create_sub_devices(py::object py_properties)
+      {
+        std::vector<cl_device_partition_property> properties;
+
+        COPY_PY_LIST(cl_device_partition_property, properties);
+        properties.push_back(0);
+
+        cl_device_partition_property *props_ptr
+          = properties.empty( ) ? nullptr : &properties.front();
+
+        // First call: only ask how many sub-devices would be created.
+        cl_uint num_entries;
+        PYOPENCL_CALL_GUARDED(clCreateSubDevices,
+            (m_device, props_ptr, 0, nullptr, &num_entries));
+
+        py::list py_result;
+
+        // front() on an empty vector is undefined behavior, so do not
+        // attempt the second call when there is nothing to fetch.
+        if (num_entries == 0)
+          return py_result;
+
+        std::vector<cl_device_id> result;
+        result.resize(num_entries);
+
+        // Second call: actually create the sub-devices.
+        PYOPENCL_CALL_GUARDED(clCreateSubDevices,
+            (m_device, props_ptr, num_entries, &result.front(), nullptr));
+
+        for (cl_device_id did: result)
+          py_result.append(handle_from_new_ptr(
+                new pyopencl::device(did, /*retain*/true,
+                  device::REF_CL_1_2)));
+        return py_result;
+      }
+#endif
+
+  };
+
+
+
+
+  // Return a Python list holding one wrapped device object per device of
+  // type `devtype` on this platform.
+  //
+  // A CL_DEVICE_NOT_FOUND status from the counting query is treated as
+  // "no devices" and yields an empty list; any other failure of that
+  // query throws pyopencl::error.
+  inline py::list platform::get_devices(cl_device_type devtype)
+  {
+    PYOPENCL_PRINT_CALL_TRACE("clGetDeviceIDs");
+
+    // Step 1: ask only for the number of matching devices.
+    cl_uint num_devices = 0;
+    const cl_int count_status
+      = clGetDeviceIDs(m_platform, devtype, 0, 0, &num_devices);
+
+    if (count_status == CL_DEVICE_NOT_FOUND)
+      num_devices = 0;
+    else if (count_status != CL_SUCCESS)
+      throw pyopencl::error("clGetDeviceIDs", count_status);
+
+    if (num_devices == 0)
+      return py::list();
+
+    // Step 2: fetch the ids. The vector is non-empty at this point, so
+    // taking the address of its first element is well-defined.
+    std::vector<cl_device_id> devices(num_devices);
+    PYOPENCL_CALL_GUARDED(clGetDeviceIDs,
+        (m_platform, devtype, num_devices, &devices.front(), &num_devices));
+
+    // Step 3: wrap every id for Python.
+    py::list result;
+    for (size_t idx = 0; idx < devices.size(); ++idx)
+    {
+      result.append(handle_from_new_ptr(new device(devices[idx])));
+    }
+
+    return result;
+  }
+
+  // }}}
+
+
+  // {{{ context
+
+  // Wrapper around a cl_context handle.
+  //
+  // The constructor optionally retains the handle; the destructor releases
+  // it exactly once. Instances are non-copyable.
+  class context : public noncopyable
+  {
+    private:
+      cl_context m_context;
+
+    public:
+      // Wrap `ctx`. If `retain` is true, clRetainContext is called so that
+      // the release in the destructor is balanced.
+      context(cl_context ctx, bool retain)
+        : m_context(ctx)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainContext, (ctx));
+      }
+
+      ~context()
+      {
+        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseContext,
+            (m_context));
+      }
+
+      // Raw OpenCL handle wrapped by this object.
+      cl_context data() const
+      {
+        return m_context;
+      }
+
+      PYOPENCL_EQUALITY_TESTS(context);
+
+      // Query clGetContextInfo and convert the result to a Python object.
+      //
+      // CL_CONTEXT_DEVICES yields a list of wrapped devices;
+      // CL_CONTEXT_PROPERTIES yields a list of (key, value) tuples, where
+      // the value is None for the terminating 0 key. Unsupported
+      // `param_name` values throw error(CL_INVALID_VALUE).
+      py::object get_info(cl_context_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_CONTEXT_REFERENCE_COUNT:
+            PYOPENCL_GET_INTEGRAL_INFO(
+                Context, m_context, param_name, cl_uint);
+
+          case CL_CONTEXT_DEVICES:
+            {
+              std::vector<cl_device_id> result;
+              PYOPENCL_GET_VEC_INFO(Context, m_context, param_name, result);
+
+              py::list py_result;
+              for (cl_device_id did: result)
+                py_result.append(handle_from_new_ptr(
+                      new pyopencl::device(did)));
+              return py_result;
+            }
+
+          case CL_CONTEXT_PROPERTIES:
+            {
+              std::vector<cl_context_properties> result;
+              PYOPENCL_GET_VEC_INFO(Context, m_context, param_name, result);
+
+              // The property list is a flat sequence of key/value pairs
+              // followed by a single 0 terminator.
+              py::list py_result;
+              for (size_t i = 0; i < result.size(); i+=2)
+              {
+                cl_context_properties key = result[i];
+
+                // A default-constructed py::object is a null handle, which
+                // py::make_tuple refuses to convert. Start from None so the
+                // terminator entry (which has no value) is representable.
+                py::object value = py::none();
+                switch (key)
+                {
+                  case CL_CONTEXT_PLATFORM:
+                    {
+                      value = py::object(
+                          handle_from_new_ptr(new platform(
+                            reinterpret_cast<cl_platform_id>(result[i+1]))));
+                      break;
+                    }
+
+#if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1)
+#if defined(__APPLE__) && defined(HAVE_GL)
+                  case CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE:
+#else
+                  case CL_GL_CONTEXT_KHR:
+                  case CL_EGL_DISPLAY_KHR:
+                  case CL_GLX_DISPLAY_KHR:
+                  case CL_WGL_HDC_KHR:
+                  case CL_CGL_SHAREGROUP_KHR:
+#endif
+                    value = py::cast(result[i+1]);
+                    break;
+
+#endif
+                  case 0:
+                    // Terminator: no value follows, keep None.
+                    break;
+
+                  default:
+                    throw error("Context.get_info", CL_INVALID_VALUE,
+                        "unknown context_property key encountered");
+                }
+
+                py_result.append(py::make_tuple(result[i], value));
+              }
+              return py_result;
+            }
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+          case CL_CONTEXT_NUM_DEVICES:
+            PYOPENCL_GET_INTEGRAL_INFO(
+                Context, m_context, param_name, cl_uint);
+#endif
+
+          default:
+            throw error("Context.get_info", CL_INVALID_VALUE);
+        }
+      }
+
+
+      // not exposed to python
+      //
+      // Parse the CL_PLATFORM_VERSION string of the platform owning the
+      // context's first device and return it as (major << 12 | minor << 4),
+      // e.g. "OpenCL 1.2 ..." gives 0x1020. Throws error(CL_INVALID_VALUE)
+      // if the context has no devices or the string is not of the form
+      // "OpenCL <major>.<minor> ".
+      int get_hex_platform_version() const
+      {
+        std::vector<cl_device_id> devices;
+        PYOPENCL_GET_VEC_INFO(Context, m_context, CL_CONTEXT_DEVICES, devices);
+
+        if (devices.size() == 0)
+          throw error("Context._get_hex_version", CL_INVALID_VALUE,
+              "platform has no devices");
+
+        cl_platform_id plat;
+
+        PYOPENCL_CALL_GUARDED(clGetDeviceInfo,
+            (devices[0], CL_DEVICE_PLATFORM, sizeof(plat), &plat, nullptr));
+
+        std::string plat_version;
+        {
+          // Usual two-step query: size first, then the bytes themselves.
+          size_t param_value_size;
+          PYOPENCL_CALL_GUARDED(clGetPlatformInfo,
+              (plat, CL_PLATFORM_VERSION, 0, 0, &param_value_size));
+
+          std::vector<char> param_value(param_value_size);
+          PYOPENCL_CALL_GUARDED(clGetPlatformInfo,
+              (plat, CL_PLATFORM_VERSION, param_value_size,
+               param_value.empty( ) ? nullptr : &param_value.front(), &param_value_size));
+
+          // The reported size includes the trailing NUL, hence the -1.
+          plat_version =
+              param_value.empty( ) ? "" : std::string(&param_value.front(), param_value_size-1);
+        }
+
+        int major_ver, minor_ver;
+        errno = 0;
+        int match_count = sscanf(plat_version.c_str(), "OpenCL %d.%d ", &major_ver, &minor_ver);
+        if (errno || match_count != 2)
+          throw error("Context._get_hex_version", CL_INVALID_VALUE,
+              "Platform version string did not have expected format");
+
+        return major_ver << 12 | minor_ver << 4;
+      }
+  };
+
+
+  // Translate a Python iterable of (key, value) tuples into the flat,
+  // 0-terminated cl_context_properties array expected by the OpenCL
+  // context-creation calls.
+  //
+  // Passing None yields an empty vector (no terminator). A tuple whose
+  // length is not 2, or whose key is not handled below, throws
+  // error("Context", CL_INVALID_VALUE, ...).
+  inline
+  std::vector<cl_context_properties> parse_context_properties(
+      py::object py_properties)
+  {
+    std::vector<cl_context_properties> flat;
+
+    // None means "no properties at all": return the empty vector as-is.
+    if (py_properties.ptr() == Py_None)
+      return flat;
+
+    for (py::handle entry_py: py_properties)
+    {
+      py::tuple entry(entry_py.cast<py::tuple>());
+
+      if (len(entry) != 2)
+        throw error("Context", CL_INVALID_VALUE, "property tuple must have length 2");
+
+      // The key is always stored first; the value encoding depends on it.
+      cl_context_properties key = entry[0].cast<cl_context_properties>();
+      flat.push_back(key);
+
+      if (key == CL_CONTEXT_PLATFORM)
+      {
+        // Value is a wrapped platform; store its raw handle.
+        flat.push_back(
+            reinterpret_cast<cl_context_properties>(
+              entry[1].cast<const platform &>().data()));
+      }
+#if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1)
+#if defined(_WIN32)
+      else if (key == CL_WGL_HDC_KHR)
+      {
+        // size_t is a stand-in for HANDLE, hopefully has the same size.
+        size_t hnd = (entry[1]).cast<size_t>();
+        flat.push_back(hnd);
+      }
+#endif
+      else if (
+#if defined(__APPLE__) && defined(HAVE_GL)
+          key == CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE
+#else
+          key == CL_GL_CONTEXT_KHR
+          || key == CL_EGL_DISPLAY_KHR
+          || key == CL_GLX_DISPLAY_KHR
+          || key == CL_CGL_SHAREGROUP_KHR
+#endif
+          )
+      {
+        // Value is pointer-like: run it through ctypes.cast(..., c_void_p)
+        // and store the resulting integer address.
+        py::object ctypes = py::module::import("ctypes");
+        py::object raw_value = entry[1];
+        py::object c_void_p = ctypes.attr("c_void_p");
+        py::object ptr = ctypes.attr("cast")(raw_value, c_void_p);
+        flat.push_back(ptr.attr("value").cast<cl_context_properties>());
+      }
+#endif
+      else
+        throw error("Context", CL_INVALID_VALUE, "invalid context property");
+    }
+
+    // Terminate the list as required by the OpenCL API.
+    flat.push_back(0);
+
+    return flat;
+  }
+
+
+  // Create a new OpenCL context and return it wrapped in a heap-allocated
+  // `context` object.
+  //
+  // Exactly one source of devices is used: an explicit device list
+  // (`py_devices`) via clCreateContext, or a device type (`py_dev_type`,
+  // defaulting to CL_DEVICE_TYPE_DEFAULT when None) via
+  // clCreateContextFromType. Supplying both throws
+  // error(CL_INVALID_VALUE); a failing OpenCL call throws pyopencl::error
+  // carrying the status code.
+  inline
+  context *create_context_inner(py::object py_devices, py::object py_properties,
+      py::object py_dev_type)
+  {
+    std::vector<cl_context_properties> prop_list
+      = parse_context_properties(py_properties);
+
+    cl_context_properties *prop_list_ptr = nullptr;
+    if (!prop_list.empty())
+      prop_list_ptr = &prop_list.front();
+
+    const bool have_devices = py_devices.ptr() != Py_None;
+    const bool have_dev_type = py_dev_type.ptr() != Py_None;
+
+    cl_int status_code;
+    cl_context ctx;
+
+    if (have_devices)
+    {
+      // Path 1: explicit device list.
+      if (have_dev_type)
+        throw error("Context", CL_INVALID_VALUE,
+            "one of 'devices' or 'dev_type' must be None");
+
+      std::vector<cl_device_id> device_ids;
+      for (py::handle py_dev: py_devices)
+        device_ids.push_back(py_dev.cast<const device &>().data());
+
+      cl_device_id *device_ids_ptr = nullptr;
+      if (!device_ids.empty())
+        device_ids_ptr = &device_ids.front();
+
+      PYOPENCL_PRINT_CALL_TRACE("clCreateContext");
+      ctx = clCreateContext(
+          prop_list_ptr, device_ids.size(), device_ids_ptr,
+          0, 0, &status_code);
+    }
+    else
+    {
+      // Path 2: let the implementation pick devices by type.
+      cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT;
+      if (have_dev_type)
+        dev_type = py_dev_type.cast<cl_device_type>();
+
+      PYOPENCL_PRINT_CALL_TRACE("clCreateContextFromType");
+      ctx = clCreateContextFromType(
+          prop_list_ptr, dev_type, 0, 0, &status_code);
+    }
+
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("Context", status_code);
+
+    // The freshly created handle already carries our reference, so the
+    // wrapper must not retain it again. If wrapping fails, give the
+    // reference back before propagating the exception.
+    try
+    {
+      return new context(ctx, false);
+    }
+    catch (...)
+    {
+      PYOPENCL_CALL_GUARDED(clReleaseContext, (ctx));
+      throw;
+    }
+  }
+
+
+  // Entry point for context creation: forwards all three arguments
+  // unchanged to create_context_inner, with the call wrapped in
+  // PYOPENCL_RETRY_RETURN_IF_MEM_ERROR.
+  // NOTE(review): the macro is defined outside this view; judging by its
+  // name it re-runs the statement after a memory-related error -- confirm.
+  inline
+  context *create_context(py::object py_devices, py::object py_properties,
+      py::object py_dev_type)
+  {
+    PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
+      return create_context_inner(py_devices, py_properties, py_dev_type);
+    )
+  }
+
+  // }}}
+
+
+  // {{{ command_queue
+
+  // Wrapper around a cl_command_queue handle.
+  //
+  // Copying retains the underlying queue; destruction releases it once.
+  class command_queue
+  {
+    private:
+      cl_command_queue m_queue;
+
+    public:
+      // Wrap an existing queue handle; retains it first if `retain` is true.
+      command_queue(cl_command_queue q, bool retain)
+        : m_queue(q)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (q));
+      }
+
+      // Copy: share the same handle and take an additional reference.
+      command_queue(command_queue const &src)
+        : m_queue(src.m_queue)
+      {
+        PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
+      }
+
+      // Create a new queue on `ctx`.
+      //
+      // py_dev: device to create the queue on; if null, the first device
+      //   reported by the context is used.
+      // py_props: None (no properties), a value convertible to
+      //   cl_command_queue_properties (bit field), or an iterable of
+      //   cl_queue_properties values (without the terminating 0) that is
+      //   handed to clCreateCommandQueueWithProperties.
+      //
+      // Throws pyopencl::error if the context has no devices, if an
+      // iterable is given but PyOpenCL was built against pre-2.0 headers,
+      // or if queue creation fails.
+      command_queue(
+          const context &ctx,
+          const device *py_dev=nullptr,
+          py::object py_props=py::none())
+      {
+        cl_device_id dev;
+        if (py_dev)
+          dev = py_dev->data();
+        else
+        {
+          std::vector<cl_device_id> devs;
+          PYOPENCL_GET_VEC_INFO(Context, ctx.data(), CL_CONTEXT_DEVICES, devs);
+          if (devs.size() == 0)
+            throw pyopencl::error("CommandQueue", CL_INVALID_VALUE,
+                "context doesn't have any devices? -- don't know which one to default to");
+          dev = devs[0];
+        }
+
+        int hex_plat_version = ctx.get_hex_platform_version();
+
+        // Decide whether the properties are a plain bit field or an
+        // iterable property list.
+        bool props_given_as_numeric;
+        cl_command_queue_properties num_props;
+        if (py_props.is_none())
+        {
+          num_props = 0;
+          props_given_as_numeric = true;
+        }
+        else
+        {
+          try
+          {
+            num_props = py::cast<cl_command_queue_properties>(py_props);
+            props_given_as_numeric = true;
+          }
+          catch (py::cast_error &)
+          {
+            props_given_as_numeric = false;
+          }
+        }
+
+        if (props_given_as_numeric)
+        {
+#if PYOPENCL_CL_VERSION >= 0x2000
+          if (hex_plat_version  >= 0x2000)
+          {
+            // Platform declares 2.0+: use the non-deprecated entry point.
+            cl_queue_properties props_list[] = { CL_QUEUE_PROPERTIES, num_props, 0 };
+
+            cl_int status_code;
+
+            PYOPENCL_PRINT_CALL_TRACE("clCreateCommandQueueWithProperties");
+            m_queue = clCreateCommandQueueWithProperties(
+                ctx.data(), dev, props_list, &status_code);
+
+            if (status_code != CL_SUCCESS)
+              throw pyopencl::error("CommandQueue", status_code);
+          }
+          else
+#endif
+          {
+            cl_int status_code;
+
+            PYOPENCL_PRINT_CALL_TRACE("clCreateCommandQueue");
+#if defined(__GNUG__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+            m_queue = clCreateCommandQueue(
+                ctx.data(), dev, num_props, &status_code);
+#if defined(__GNUG__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+            if (status_code != CL_SUCCESS)
+              throw pyopencl::error("CommandQueue", status_code);
+          }
+        }
+        else
+        {
+#if PYOPENCL_CL_VERSION < 0x2000
+          // Property lists need clCreateCommandQueueWithProperties, which
+          // does not exist in pre-2.0 headers.
+          throw error("CommandQueue", CL_INVALID_VALUE,
+              "queue properties given as an iterable, "
+              "which is only allowed when PyOpenCL was built "
+              "against an OpenCL 2+ header");
+#else
+          if (hex_plat_version  < 0x2000)
+          {
+            std::cerr <<
+                "queue properties given as an iterable, "
+                "which uses an OpenCL 2+-only interface, "
+                "but the context's platform does not "
+                "declare OpenCL 2 support. Proceeding "
+                "as requested, but the next thing you see "
+                "may be a crash." << std::endl;
+          }
+
+          // Flatten the iterable into a 0-terminated array. A std::vector
+          // is used instead of a variable-length array, which is not
+          // standard C++.
+          std::vector<cl_queue_properties> props;
+          for (auto prop: py_props)
+            props.push_back(py::cast<cl_queue_properties>(prop));
+          props.push_back(0);
+
+          cl_int status_code;
+          PYOPENCL_PRINT_CALL_TRACE("clCreateCommandQueueWithProperties");
+          m_queue = clCreateCommandQueueWithProperties(
+              ctx.data(), dev, &props.front(), &status_code);
+
+          if (status_code != CL_SUCCESS)
+            throw pyopencl::error("CommandQueue", status_code);
+#endif
+        }
+      }
+
+      ~command_queue()
+      {
+        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue,
+            (m_queue));
+      }
+
+      // Raw OpenCL handle wrapped by this object.
+      const cl_command_queue data() const
+      { return m_queue; }
+
+      PYOPENCL_EQUALITY_TESTS(command_queue);
+
+      // Query clGetCommandQueueInfo and convert the result to a Python
+      // object. Unsupported `param_name` values throw
+      // error(CL_INVALID_VALUE).
+      py::object get_info(cl_command_queue_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_QUEUE_CONTEXT:
+            PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name,
+                cl_context, context);
+          case CL_QUEUE_DEVICE:
+            PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name,
+                cl_device_id, device);
+          case CL_QUEUE_REFERENCE_COUNT:
+            PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name,
+                cl_uint);
+          case CL_QUEUE_PROPERTIES:
+            PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name,
+                cl_command_queue_properties);
+
+          default:
+            throw error("CommandQueue.get_info", CL_INVALID_VALUE);
+        }
+      }
+
+      // Return a new wrapper (holding its own reference) for the context
+      // this queue belongs to.
+      std::unique_ptr<context> get_context() const
+      {
+        cl_context param_value;
+        PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo,
+            (m_queue, CL_QUEUE_CONTEXT, sizeof(param_value), &param_value, 0));
+        return std::unique_ptr<context>(
+            new context(param_value, /*retain*/ true));
+      }
+
+#if PYOPENCL_CL_VERSION < 0x1010
+      // Enable or disable `prop` on the queue via clSetCommandQueueProperty
+      // and return the previous property bit field.
+      cl_command_queue_properties set_property(
+          cl_command_queue_properties prop,
+          bool enable)
+      {
+        cl_command_queue_properties old_prop;
+        PYOPENCL_CALL_GUARDED(clSetCommandQueueProperty,
+            (m_queue, prop, PYOPENCL_CAST_BOOL(enable), &old_prop));
+        return old_prop;
+      }
+#endif
+
+      void flush()
+      { PYOPENCL_CALL_GUARDED(clFlush, (m_queue)); }
+      void finish()
+      { PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_queue)); }
+  };
+
+  // }}}
+
+
+  // {{{ event/synchronization
+
+  // Wrapper around a cl_event handle.
+  //
+  // The explicit copy constructor retains the handle; the destructor
+  // releases it once. With OpenCL 1.1+ headers, set_callback() allows a
+  // Python callable to be invoked when the event reaches a given status.
+  class event : noncopyable
+  {
+    private:
+      cl_event m_event;
+
+    public:
+      // Wrap an existing event handle; retains it first if `retain` is true.
+      event(cl_event event, bool retain)
+        : m_event(event)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainEvent, (event));
+      }
+
+      // Copy: share the same handle and take an additional reference.
+      event(event const &src)
+        : m_event(src.m_event)
+      { PYOPENCL_CALL_GUARDED(clRetainEvent, (m_event)); }
+
+      virtual ~event()
+      {
+        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseEvent,
+            (m_event));
+      }
+
+      // Raw OpenCL handle wrapped by this object.
+      const cl_event data() const
+      { return m_event; }
+
+      PYOPENCL_EQUALITY_TESTS(event);
+
+      // Query clGetEventInfo and convert the result to a Python object.
+      // Unsupported `param_name` values throw error(CL_INVALID_VALUE).
+      py::object get_info(cl_event_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_EVENT_COMMAND_QUEUE:
+            PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name,
+                cl_command_queue, command_queue);
+          case CL_EVENT_COMMAND_TYPE:
+            PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+                cl_command_type);
+          case CL_EVENT_COMMAND_EXECUTION_STATUS:
+            PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+                cl_int);
+          case CL_EVENT_REFERENCE_COUNT:
+            PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+                cl_uint);
+#if PYOPENCL_CL_VERSION >= 0x1010
+          case CL_EVENT_CONTEXT:
+            PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name,
+                cl_context, context);
+#endif
+
+          default:
+            throw error("Event.get_info", CL_INVALID_VALUE);
+        }
+      }
+
+      // Query clGetEventProfilingInfo; every supported parameter is
+      // returned as a cl_ulong converted to a Python object. Unsupported
+      // `param_name` values throw error(CL_INVALID_VALUE).
+      py::object get_profiling_info(cl_profiling_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_PROFILING_COMMAND_QUEUED:
+          case CL_PROFILING_COMMAND_SUBMIT:
+          case CL_PROFILING_COMMAND_START:
+          case CL_PROFILING_COMMAND_END:
+#if PYOPENCL_CL_VERSION >= 0x2000
+          case CL_PROFILING_COMMAND_COMPLETE:
+#endif
+            PYOPENCL_GET_INTEGRAL_INFO(EventProfiling, m_event, param_name,
+                cl_ulong);
+          default:
+            throw error("Event.get_profiling_info", CL_INVALID_VALUE);
+        }
+      }
+
+      // Block until this event has completed (clWaitForEvents).
+      virtual void wait()
+      {
+        PYOPENCL_CALL_GUARDED_THREADED(clWaitForEvents, (1, &m_event));
+      }
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+    // {{{ set_callback, by way of a thread-based construction
+
+    private:
+      // State shared between set_callback(), the OpenCL-side callback and
+      // the detached notification thread. Owned (and finally deleted) by
+      // the notification thread.
+      struct event_callback_info_t
+      {
+        // m_mutex/m_condvar guard and signal the flags and status below.
+        std::mutex m_mutex;
+        std::condition_variable m_condvar;
+
+        py::object m_py_event;
+        py::object m_py_callback;
+
+        // Cleared if clSetEventCallback failed, in which case the Python
+        // callback must not be invoked.
+        bool m_set_callback_suceeded;
+
+        // Predicate for the condition variable; guards against spurious
+        // wakeups.
+        bool m_notify_thread_wakeup_is_genuine;
+
+        cl_event m_event;
+        cl_int m_command_exec_status;
+
+        event_callback_info_t(py::object py_event, py::object py_callback)
+        : m_py_event(py_event), m_py_callback(py_callback), m_set_callback_suceeded(true),
+        m_notify_thread_wakeup_is_genuine(false)
+        {}
+      };
+
+      // Callback handed to clSetEventCallback. It records the outcome and
+      // wakes the notification thread. CL_CALLBACK supplies the calling
+      // convention required by the clSetEventCallback prototype.
+      static void CL_CALLBACK evt_callback(
+          cl_event evt, cl_int command_exec_status, void *user_data)
+      {
+        event_callback_info_t *cb_info = reinterpret_cast<event_callback_info_t *>(user_data);
+
+        std::lock_guard<std::mutex> lg(cb_info->m_mutex);
+        cb_info->m_event = evt;
+        cb_info->m_command_exec_status = command_exec_status;
+        cb_info->m_notify_thread_wakeup_is_genuine = true;
+
+        // Notify while still holding the mutex. The notification thread
+        // deletes cb_info once it has observed the flag; if the mutex were
+        // released first, a spurious wakeup could let it do so before
+        // notify_one() runs, and this call would touch freed memory.
+        cb_info->m_condvar.notify_one();
+      }
+
+    public:
+      // Arrange for `pfn_event_notify` to be called with the command
+      // execution status once this event reaches
+      // `command_exec_callback_type`.
+      //
+      // Throws (via PYOPENCL_CALL_GUARDED) if clSetEventCallback fails; in
+      // that case the Python callback is never invoked.
+      void set_callback(cl_int command_exec_callback_type, py::object pfn_event_notify)
+      {
+        // The reason for doing this via a thread is that we're able to wait on
+        // acquiring the GIL. (which we can't in the callback)
+
+        std::unique_ptr<event_callback_info_t> cb_info_holder(
+            new event_callback_info_t(
+              handle_from_new_ptr(new event(*this)),
+              pfn_event_notify));
+        event_callback_info_t *cb_info = cb_info_holder.get();
+
+        std::thread notif_thread([cb_info]()
+            {
+              {
+                std::unique_lock<std::mutex> ulk(cb_info->m_mutex);
+                cb_info->m_condvar.wait(
+                    ulk,
+                    [&](){ return cb_info->m_notify_thread_wakeup_is_genuine; });
+
+                // ulk is released at the end of this scope; after that,
+                // cb_info is ready for deletion.
+              }
+
+              {
+                py::gil_scoped_acquire acquire;
+
+                if (cb_info->m_set_callback_suceeded)
+                {
+                  try {
+                    cb_info->m_py_callback(
+                        // cb_info->m_py_event,
+                        cb_info->m_command_exec_status);
+                  }
+                  catch (std::exception &exc)
+                  {
+                    std::cerr
+                    << "[pyopencl] event callback handler threw an exception, ignoring: "
+                    << exc.what()
+                    << std::endl;
+                  }
+                }
+
+                // Need to hold GIL to delete py::object instances in
+                // event_callback_info_t
+                delete cb_info;
+              }
+            });
+        // Thread is away--it is now its responsibility to free cb_info.
+        cb_info_holder.release();
+
+        // notif_thread should no longer be coupled to the lifetime of the thread.
+        notif_thread.detach();
+
+        try
+        {
+          PYOPENCL_CALL_GUARDED(clSetEventCallback, (
+                data(), command_exec_callback_type, &event::evt_callback, cb_info));
+        }
+        catch (...) {
+          // Setting the callback did not succeed. The thread would never
+          // be woken up. Wake it up to let it know that it can stop.
+          {
+            std::lock_guard<std::mutex> lg(cb_info->m_mutex);
+            cb_info->m_set_callback_suceeded = false;
+            cb_info->m_notify_thread_wakeup_is_genuine = true;
+
+            // As in evt_callback: notify before unlocking so cb_info
+            // cannot be deleted between the two steps.
+            cb_info->m_condvar.notify_one();
+          }
+          throw;
+        }
+      }
+      // }}}
+#endif
+  };
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+  // An event that additionally keeps a "ward" (a py_buffer_wrapper) alive
+  // until the event has been waited on. Destroying the object waits for
+  // the event first.
+  class nanny_event : public event
+  {
+    protected:
+      std::unique_ptr<py_buffer_wrapper> m_ward;
+
+    public:
+      // Takes over ownership of `ward`; the caller's pointer is left empty.
+      nanny_event(cl_event evt, bool retain,
+          std::unique_ptr<py_buffer_wrapper> &ward)
+        : event(evt, retain), m_ward(std::move(ward))
+      { }
+
+      ~nanny_event()
+      {
+        wait();
+      }
+
+      // The Python object behind the held buffer, or None once the ward
+      // has been dropped.
+      py::object get_ward() const
+      {
+        if (!m_ward.get())
+          return py::none();
+
+        return py::reinterpret_borrow<py::object>(m_ward->m_buf.obj);
+      }
+
+      // Wait for the event, then let go of the ward.
+      virtual void wait()
+      {
+        event::wait();
+        m_ward.reset();
+      }
+  };
+#else
+  // An event that additionally holds a reference to a Python object (the
+  // "ward") until the event has been waited on. Destroying the object
+  // waits for the event first.
+  class nanny_event : public event
+  {
+    protected:
+      py::object        m_ward;
+
+    public:
+      nanny_event(cl_event evt, bool retain, py::object ward)
+        : event(evt, retain), m_ward(ward)
+      { }
+
+      // Copies share both the event handle and the ward.
+      nanny_event(nanny_event const &src)
+        : event(src), m_ward(src.m_ward)
+      { }
+
+      ~nanny_event()
+      {
+        wait();
+      }
+
+      // The held Python object (None after wait()).
+      py::object get_ward() const
+      {
+        return m_ward;
+      }
+
+      // Wait for the event, then replace the ward with None.
+      virtual void wait()
+      {
+        event::wait();
+        m_ward = py::none();
+      }
+  };
+#endif
+
+
+
+
+  // Calls clWaitForEvents on all events contained in the Python iterable
+  // 'events'. Every element must be castable to pyopencl::event.
+  inline
+  void wait_for_events(py::object events)
+  {
+    // NOTE(review): these two names appear to be what PYOPENCL_WAITLIST_ARGS
+    // (defined elsewhere) expands to -- confirm before renaming them.
+    cl_uint num_events_in_wait_list = 0;
+    std::vector<cl_event> event_wait_list(len(events));
+
+    // Collect the raw cl_event handles in iteration order.
+    for (py::handle evt: events)
+      event_wait_list[num_events_in_wait_list++] =
+        evt.cast<event &>().data();
+
+    PYOPENCL_CALL_GUARDED_THREADED(clWaitForEvents, (
+          PYOPENCL_WAITLIST_ARGS));
+  }
+
+
+
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  // Enqueues a marker on 'cq' through clEnqueueMarkerWithWaitList, using the
+  // wait list parsed from 'py_wait_for', and hands the resulting cl_event to
+  // PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_marker_with_wait_list(command_queue &cq,
+      py::object py_wait_for)
+  {
+    // Presumably declares the locals that PYOPENCL_WAITLIST_ARGS refers to
+    // (both macros are defined elsewhere).
+    PYOPENCL_PARSE_WAIT_FOR;
+    cl_event evt;
+
+    PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, (
+          cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+  // Enqueues a barrier on 'cq' through clEnqueueBarrierWithWaitList, using
+  // the wait list parsed from 'py_wait_for', and hands the resulting cl_event
+  // to PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_barrier_with_wait_list(command_queue &cq,
+      py::object py_wait_for)
+  {
+    // Presumably declares the locals that PYOPENCL_WAITLIST_ARGS refers to
+    // (both macros are defined elsewhere).
+    PYOPENCL_PARSE_WAIT_FOR;
+    cl_event evt;
+
+    PYOPENCL_CALL_GUARDED(clEnqueueBarrierWithWaitList,
+        (cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+#endif
+
+
+  // {{{ used internally for pre-OpenCL-1.2 contexts
+
+  // Enqueues a marker on 'cq' through clEnqueueMarker (no wait list) and
+  // hands the resulting cl_event to PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_marker(command_queue &cq)
+  {
+    cl_event evt;
+
+    PYOPENCL_CALL_GUARDED(clEnqueueMarker, (
+          cq.data(), &evt));
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+  // Enqueues a wait on 'cq' (clEnqueueWaitForEvents) for every event in the
+  // Python collection 'py_events'. Each element must be castable to
+  // pyopencl::event.
+  inline
+  void enqueue_wait_for_events(command_queue &cq, py::object py_events)
+  {
+    // One slot per element of the collection, filled in iteration order.
+    std::vector<cl_event> cl_events(len(py_events));
+    cl_uint filled = 0;
+
+    for (py::handle item: py_events)
+    {
+      cl_events[filled] = item.cast<event &>().data();
+      ++filled;
+    }
+
+    // Never take the address of an element of an empty vector.
+    cl_event *first = nullptr;
+    if (!cl_events.empty())
+      first = &cl_events.front();
+
+    PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (
+          cq.data(), filled, first));
+  }
+
+  // Enqueues a barrier on 'cq' through clEnqueueBarrier; no event is returned.
+  inline
+  void enqueue_barrier(command_queue &cq)
+  {
+    PYOPENCL_CALL_GUARDED(clEnqueueBarrier, (cq.data()));
+  }
+
+  // }}}
+
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+  // Event whose execution status can be set from the host.
+  class user_event : public event
+  {
+    public:
+      user_event(cl_event evt, bool retain)
+        : event(evt, retain)
+      { }
+
+      // Forwards 'execution_status' to clSetUserEventStatus for this event.
+      void set_status(cl_int execution_status)
+      {
+        PYOPENCL_CALL_GUARDED(clSetUserEventStatus, (data(), execution_status));
+      }
+  };
+
+
+
+
+  // Creates a user event in 'ctx' and wraps it in a heap-allocated
+  // user_event. Throws pyopencl::error if clCreateUserEvent fails.
+  inline
+  user_event *create_user_event(context &ctx)
+  {
+    PYOPENCL_PRINT_CALL_TRACE("clCreateUserEvent");
+
+    cl_int err = CL_SUCCESS;
+    cl_event raw_event = clCreateUserEvent(ctx.data(), &err);
+    if (err != CL_SUCCESS)
+      throw pyopencl::error("UserEvent", err);
+
+    // The wrapper adopts the reference returned by clCreateUserEvent
+    // (retain=false); if wrapping fails, give that reference back.
+    user_event *wrapped = nullptr;
+    try
+    {
+      wrapped = new user_event(raw_event, false);
+    }
+    catch (...)
+    {
+      clReleaseEvent(raw_event);
+      throw;
+    }
+    return wrapped;
+  }
+
+#endif
+
+  // }}}
+
+
+  // {{{ memory_object
+
+  py::object create_mem_object_wrapper(cl_mem mem, bool retain);
+
+  // Abstract interface for anything that can provide a cl_mem handle.
+  class memory_object_holder
+  {
+    public:
+      // The underlying OpenCL memory object handle.
+      virtual const cl_mem data() const = 0;
+
+      PYOPENCL_EQUALITY_TESTS(memory_object_holder);
+
+      // Size of the memory object as reported by
+      // clGetMemObjectInfo(CL_MEM_SIZE).
+      size_t size() const
+      {
+        size_t param_value;
+        PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+            (data(), CL_MEM_SIZE, sizeof(param_value), &param_value, 0));
+        return param_value;
+      }
+
+      // Defined elsewhere.
+      py::object get_info(cl_mem_info param_name) const;
+  };
+
+
+
+
+  // Owning wrapper around a cl_mem handle, optionally together with the host
+  // buffer object associated with it. The handle is released on destruction
+  // unless release() was already called.
+  class memory_object : noncopyable, public memory_object_holder
+  {
+    public:
+      // Type used to hold the host buffer; depends on the buffer interface
+      // selected at compile time.
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+      typedef std::unique_ptr<py_buffer_wrapper> hostbuf_t;
+#else
+      typedef py::object hostbuf_t;
+#endif
+
+    private:
+      // False once release() has run; guards against a double release.
+      bool m_valid;
+      cl_mem m_mem;
+      hostbuf_t m_hostbuf;
+
+    public:
+      // Wraps 'mem'. If 'retain' is true, an extra reference is taken with
+      // clRetainMemObject. 'hostbuf' is stored alongside the handle.
+      memory_object(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
+        : m_valid(true), m_mem(mem)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainMemObject, (mem));
+
+        m_hostbuf = PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf);
+      }
+
+      // Shares src's handle (retaining it). Note the non-const 'src': its
+      // host buffer is passed through PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF,
+      // which, judging by its name, moves it out of 'src' under the new
+      // buffer interface -- macro defined elsewhere, confirm.
+      memory_object(memory_object &src)
+        : m_valid(true), m_mem(src.m_mem),
+        m_hostbuf(PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(src.m_hostbuf))
+      {
+        PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem));
+      }
+
+      // Shares the handle of any holder (retaining it); no host buffer.
+      memory_object(memory_object_holder const &src)
+        : m_valid(true), m_mem(src.data())
+      {
+        PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem));
+      }
+
+      // Releases the handle. Throws pyopencl::error (CL_INVALID_VALUE) if it
+      // has already been released.
+      void release()
+      {
+        if (!m_valid)
+            throw error("MemoryObject.free", CL_INVALID_VALUE,
+                "trying to double-unref mem object");
+        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseMemObject, (m_mem));
+        m_valid = false;
+      }
+
+      virtual ~memory_object()
+      {
+        if (m_valid)
+          release();
+      }
+
+      // Returns the Python object behind the host buffer; under the new
+      // buffer interface this is None when no host buffer is held.
+      py::object hostbuf()
+      {
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+        if (m_hostbuf.get())
+          return py::reinterpret_borrow<py::object>(m_hostbuf->m_buf.obj);
+        else
+          return py::none();
+#else
+        return m_hostbuf;
+#endif
+      }
+
+      const cl_mem data() const
+      { return m_mem; }
+
+  };
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  // Enqueues clEnqueueMigrateMemObjects on 'cq' for every memory_object in
+  // the Python iterable 'py_mem_objects', with the given migration 'flags'
+  // and the wait list parsed from 'py_wait_for'.
+  inline
+  event *enqueue_migrate_mem_objects(
+      command_queue &cq,
+      py::object py_mem_objects,
+      cl_mem_migration_flags flags,
+      py::object py_wait_for)
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    // Collect the raw cl_mem handles; each element must be castable to
+    // memory_object.
+    std::vector<cl_mem> mem_objects;
+    for (py::handle mo: py_mem_objects)
+      mem_objects.push_back(mo.cast<const memory_object &>().data());
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueMigrateMemObjects, (
+            cq.data(),
+            mem_objects.size(), mem_objects.empty( ) ? nullptr : &mem_objects.front(),
+            flags,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ));
+      );
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+#endif
+
+  // }}}
+
+
+  // {{{ buffer
+
+  // Thin wrapper around clCreateBuffer: returns the new cl_mem handle, or
+  // throws pyopencl::error carrying the OpenCL status code on failure.
+  inline cl_mem create_buffer(
+      cl_context ctx,
+      cl_mem_flags flags,
+      size_t size,
+      void *host_ptr)
+  {
+    PYOPENCL_PRINT_CALL_TRACE("clCreateBuffer");
+
+    cl_int err = CL_SUCCESS;
+    cl_mem const new_mem = clCreateBuffer(ctx, flags, size, host_ptr, &err);
+
+    if (err == CL_SUCCESS)
+      return new_mem;
+
+    throw pyopencl::error("create_buffer", err);
+  }
+
+
+
+
+  // Same as create_buffer(), but run under PYOPENCL_RETRY_RETURN_IF_MEM_ERROR.
+  // NOTE(review): judging by the macro and function names, this retries the
+  // allocation after a garbage collection when a memory error occurs --
+  // the macro is defined elsewhere, confirm.
+  inline cl_mem create_buffer_gc(
+      cl_context ctx,
+      cl_mem_flags flags,
+      size_t size,
+      void *host_ptr)
+  {
+    PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
+      return create_buffer(ctx, flags, size, host_ptr);
+    );
+  }
+
+
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+  // Thin wrapper around clCreateSubBuffer: returns the new cl_mem handle, or
+  // throws pyopencl::error carrying the OpenCL status code on failure.
+  inline cl_mem create_sub_buffer(
+      cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type bct,
+      const void *buffer_create_info)
+  {
+    PYOPENCL_PRINT_CALL_TRACE("clCreateSubBuffer");
+
+    cl_int err = CL_SUCCESS;
+    cl_mem const sub_mem = clCreateSubBuffer(
+        buffer, flags, bct, buffer_create_info, &err);
+
+    if (err == CL_SUCCESS)
+      return sub_mem;
+
+    throw pyopencl::error("clCreateSubBuffer", err);
+  }
+
+
+
+
+  // Same as create_sub_buffer(), but run under
+  // PYOPENCL_RETRY_RETURN_IF_MEM_ERROR (macro defined elsewhere; presumably
+  // retries after a memory error -- confirm).
+  inline cl_mem create_sub_buffer_gc(
+      cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type bct,
+      const void *buffer_create_info)
+  {
+    PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
+      return create_sub_buffer(buffer, flags, bct, buffer_create_info);
+    );
+  }
+#endif
+
+
+
+  // Memory object representing an OpenCL buffer.
+  class buffer : public memory_object
+  {
+    public:
+      // See memory_object's constructor for the meaning of the arguments.
+      buffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
+        : memory_object(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf))
+      { }
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+      // Creates a sub-buffer covering 'size' bytes starting at byte 'origin'
+      // and returns it as a new heap-allocated buffer.
+      buffer *get_sub_region(
+          size_t origin, size_t size, cl_mem_flags flags) const
+      {
+        cl_buffer_region region = { origin, size};
+
+        cl_mem mem = create_sub_buffer_gc(
+            data(), flags, CL_BUFFER_CREATE_TYPE_REGION, &region);
+
+        // The wrapper adopts the reference we just got (retain=false); if
+        // wrapping fails, give that reference back.
+        try
+        {
+          return new buffer(mem, false);
+        }
+        catch (...)
+        {
+          PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
+          throw;
+        }
+      }
+
+      // Implements slicing: returns the sub-buffer selected by 'slc'.
+      // Only non-empty slices with stride 1 are accepted; anything else
+      // raises pyopencl::error (CL_INVALID_VALUE).
+      buffer *getitem(py::slice slc) const
+      {
+        PYOPENCL_BUFFER_SIZE_T start, end, stride, length;
+
+        // Slice indices are resolved against the buffer's size in bytes.
+        size_t my_length = size();
+
+#if PY_VERSION_HEX >= 0x03020000
+        if (PySlice_GetIndicesEx(slc.ptr(),
+#else
+        if (PySlice_GetIndicesEx(reinterpret_cast<PySliceObject *>(slc.ptr()),
+#endif
+              my_length, &start, &end, &stride, &length) != 0)
+          throw py::error_already_set();
+
+        if (stride != 1)
+          throw pyopencl::error("Buffer.__getitem__", CL_INVALID_VALUE,
+              "Buffer slice must have stride 1");
+
+        cl_mem_flags my_flags;
+        PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+            (data(), CL_MEM_FLAGS, sizeof(my_flags), &my_flags, 0));
+
+        // The sub-buffer inherits the parent's flags, minus the request to
+        // copy from a host pointer.
+        my_flags &= ~CL_MEM_COPY_HOST_PTR;
+
+        if (end <= start)
+          throw pyopencl::error("Buffer.__getitem__", CL_INVALID_VALUE,
+              "Buffer slice must have end > start");
+
+        return get_sub_region(start, end-start, my_flags);
+      }
+#endif
+  };
+
+  // {{{ buffer creation
+
+  // Creates an OpenCL buffer in 'ctx' and wraps it in a heap-allocated
+  // 'buffer'.
+  //
+  // 'py_hostbuf' may be None or an object exposing a buffer; when given, its
+  // memory is passed to clCreateBuffer as the host pointer. A 'size' of 0
+  // means "use the host buffer's length"; a size larger than the host buffer
+  // raises pyopencl::error (CL_INVALID_VALUE). The host buffer is kept with
+  // the returned object only when CL_MEM_USE_HOST_PTR is set.
+  inline
+  buffer *create_buffer_py(
+      context &ctx,
+      cl_mem_flags flags,
+      size_t size,
+      py::object py_hostbuf
+      )
+  {
+    if (py_hostbuf.ptr() != Py_None &&
+        !(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
+    {
+      // PyErr_Warn returns nonzero with a Python exception set when the
+      // warning has been escalated to an error. Propagate that instead of
+      // carrying on with an exception pending.
+      if (PyErr_Warn(PyExc_UserWarning, "'hostbuf' was passed, "
+          "but no memory flags to make use of it.") != 0)
+        throw py::error_already_set();
+    }
+
+    void *buf = 0;
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
+    if (py_hostbuf.ptr() != Py_None)
+    {
+      retained_buf_obj = std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper);
+
+      // Only demand a writable buffer if the device may write into host
+      // memory directly.
+      int py_buf_flags = PyBUF_ANY_CONTIGUOUS;
+      if ((flags & CL_MEM_USE_HOST_PTR)
+          && ((flags & CL_MEM_READ_WRITE)
+            || (flags & CL_MEM_WRITE_ONLY)))
+        py_buf_flags |= PyBUF_WRITABLE;
+
+      retained_buf_obj->get(py_hostbuf.ptr(), py_buf_flags);
+
+      buf = retained_buf_obj->m_buf.buf;
+
+      if (size > size_t(retained_buf_obj->m_buf.len))
+        throw pyopencl::error("Buffer", CL_INVALID_VALUE,
+            "specified size is greater than host buffer size");
+      if (size == 0)
+        size = retained_buf_obj->m_buf.len;
+    }
+#else
+    py::object retained_buf_obj;
+    if (py_hostbuf.ptr() != Py_None)
+    {
+      PYOPENCL_BUFFER_SIZE_T len;
+      // Only demand a writable buffer if the device may write into host
+      // memory directly.
+      if ((flags & CL_MEM_USE_HOST_PTR)
+          && ((flags & CL_MEM_READ_WRITE)
+            || (flags & CL_MEM_WRITE_ONLY)))
+      {
+        if (PyObject_AsWriteBuffer(py_hostbuf.ptr(), &buf, &len))
+          throw py::error_already_set();
+      }
+      else
+      {
+        if (PyObject_AsReadBuffer(
+              py_hostbuf.ptr(), const_cast<const void **>(&buf), &len))
+          throw py::error_already_set();
+      }
+
+      if (flags & CL_MEM_USE_HOST_PTR)
+        retained_buf_obj = py_hostbuf;
+
+      if (size > size_t(len))
+        throw pyopencl::error("Buffer", CL_INVALID_VALUE,
+            "specified size is greater than host buffer size");
+      if (size == 0)
+        size = len;
+    }
+#endif
+
+    cl_mem mem = create_buffer_gc(ctx.data(), flags, size, buf);
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    // Without CL_MEM_USE_HOST_PTR the host buffer is not kept once the
+    // buffer has been created.
+    if (!(flags & CL_MEM_USE_HOST_PTR))
+      retained_buf_obj.reset();
+#endif
+
+    // The wrapper adopts the reference we just got (retain=false); if
+    // wrapping fails, give that reference back.
+    try
+    {
+      return new buffer(mem, false, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(retained_buf_obj));
+    }
+    catch (...)
+    {
+      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
+      throw;
+    }
+  }
+
+  // }}}
+
+  // {{{ buffer transfers
+
+  // {{{ byte-for-byte transfers
+
+  // Enqueues clEnqueueReadBuffer: reads from 'mem', starting at byte
+  // 'device_offset', into the writable Python buffer 'buffer'. The number of
+  // bytes read is the length of 'buffer'.
+  inline
+  event *enqueue_read_buffer(
+      command_queue &cq,
+      memory_object_holder &mem,
+      py::object buffer,
+      size_t device_offset,
+      py::object py_wait_for,
+      bool is_blocking)
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    void *buf;
+    PYOPENCL_BUFFER_SIZE_T len;
+
+    // 'ward' is what gets attached to the returned event (see the
+    // PYOPENCL_RETURN_NEW_NANNY_EVENT call below).
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
+
+    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
+
+    buf = ward->m_buf.buf;
+    len = ward->m_buf.len;
+#else
+    py::object ward = buffer;
+    if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
+      throw py::error_already_set();
+#endif
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueReadBuffer, (
+            cq.data(),
+            mem.data(),
+            PYOPENCL_CAST_BOOL(is_blocking),
+            device_offset, len, buf,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ))
+      );
+    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
+  }
+
+
+
+
+  // Enqueues clEnqueueWriteBuffer: writes the contents of the Python buffer
+  // 'buffer' into 'mem', starting at byte 'device_offset'. The number of
+  // bytes written is the length of 'buffer'.
+  inline
+  event *enqueue_write_buffer(
+      command_queue &cq,
+      memory_object_holder &mem,
+      py::object buffer,
+      size_t device_offset,
+      py::object py_wait_for,
+      bool is_blocking)
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    const void *buf;
+    PYOPENCL_BUFFER_SIZE_T len;
+
+    // 'ward' is what gets attached to the returned event (see the
+    // PYOPENCL_RETURN_NEW_NANNY_EVENT call below). A read-only buffer is
+    // sufficient for a write to the device.
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
+
+    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+
+    buf = ward->m_buf.buf;
+    len = ward->m_buf.len;
+#else
+    py::object ward = buffer;
+    if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len))
+      throw py::error_already_set();
+#endif
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueWriteBuffer, (
+            cq.data(),
+            mem.data(),
+            PYOPENCL_CAST_BOOL(is_blocking),
+            device_offset, len, buf,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ))
+      );
+    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
+  }
+
+
+
+
+  // Enqueues clEnqueueCopyBuffer: copies 'byte_count' bytes from 'src'
+  // (starting at 'src_offset') to 'dst' (starting at 'dst_offset').
+  //
+  // A negative 'byte_count' selects the smaller of the sizes that
+  // clGetMemObjectInfo(CL_MEM_SIZE) reports for 'src' and 'dst'.
+  inline
+  event *enqueue_copy_buffer(
+      command_queue &cq,
+      memory_object_holder &src,
+      memory_object_holder &dst,
+      ptrdiff_t byte_count,
+      size_t src_offset,
+      size_t dst_offset,
+      py::object py_wait_for)
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    if (byte_count < 0)
+    {
+      size_t byte_count_src = 0;
+      size_t byte_count_dst = 0;
+      PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+          (src.data(), CL_MEM_SIZE, sizeof(byte_count_src), &byte_count_src, 0));
+      // Query the destination here: asking 'src' twice would make the
+      // minimum below ignore the size of 'dst'.
+      PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+          (dst.data(), CL_MEM_SIZE, sizeof(byte_count_dst), &byte_count_dst, 0));
+      byte_count = std::min(byte_count_src, byte_count_dst);
+    }
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueCopyBuffer, (
+            cq.data(),
+            src.data(), dst.data(),
+            src_offset, dst_offset,
+            byte_count,
+            PYOPENCL_WAITLIST_ARGS,
+            &evt
+            ))
+      );
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+  // }}}
+
+  // {{{ rectangular transfers
+#if PYOPENCL_CL_VERSION >= 0x1010
+  // Enqueues clEnqueueReadBufferRect: reads a rectangular region of 'mem'
+  // into the writable Python buffer 'buffer'. Origins, region and pitches
+  // are taken from the corresponding py_* arguments.
+  inline
+  event *enqueue_read_buffer_rect(
+      command_queue &cq,
+      memory_object_holder &mem,
+      py::object buffer,
+      py::object py_buffer_origin,
+      py::object py_host_origin,
+      py::object py_region,
+      py::sequence py_buffer_pitches,
+      py::sequence py_host_pitches,
+      py::object py_wait_for,
+      bool is_blocking
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    // NOTE(review): each COPY_PY_* macro (defined elsewhere) appears to
+    // convert py_<name> into a local named <name>, used in the call below.
+    COPY_PY_COORD_TRIPLE(buffer_origin);
+    COPY_PY_COORD_TRIPLE(host_origin);
+    COPY_PY_REGION_TRIPLE(region);
+    COPY_PY_PITCH_TUPLE(buffer_pitches);
+    COPY_PY_PITCH_TUPLE(host_pitches);
+
+    void *buf;
+
+    // 'ward' is what gets attached to the returned event (see the
+    // PYOPENCL_RETURN_NEW_NANNY_EVENT call below).
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
+
+    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
+
+    buf = ward->m_buf.buf;
+#else
+    py::object ward = buffer;
+
+    PYOPENCL_BUFFER_SIZE_T len;
+    if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
+      throw py::error_already_set();
+#endif
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueReadBufferRect, (
+            cq.data(),
+            mem.data(),
+            PYOPENCL_CAST_BOOL(is_blocking),
+            buffer_origin, host_origin, region,
+            buffer_pitches[0], buffer_pitches[1],
+            host_pitches[0], host_pitches[1],
+            buf,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ))
+      );
+    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
+  }
+
+
+
+
+  // Enqueues clEnqueueWriteBufferRect: writes a rectangular region from the
+  // Python buffer 'buffer' into 'mem'. Origins, region and pitches are taken
+  // from the corresponding py_* arguments.
+  inline
+  event *enqueue_write_buffer_rect(
+      command_queue &cq,
+      memory_object_holder &mem,
+      py::object buffer,
+      py::object py_buffer_origin,
+      py::object py_host_origin,
+      py::object py_region,
+      py::sequence py_buffer_pitches,
+      py::sequence py_host_pitches,
+      py::object py_wait_for,
+      bool is_blocking
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    // NOTE(review): each COPY_PY_* macro (defined elsewhere) appears to
+    // convert py_<name> into a local named <name>, used in the call below.
+    COPY_PY_COORD_TRIPLE(buffer_origin);
+    COPY_PY_COORD_TRIPLE(host_origin);
+    COPY_PY_REGION_TRIPLE(region);
+    COPY_PY_PITCH_TUPLE(buffer_pitches);
+    COPY_PY_PITCH_TUPLE(host_pitches);
+
+    const void *buf;
+
+    // 'ward' is what gets attached to the returned event (see the
+    // PYOPENCL_RETURN_NEW_NANNY_EVENT call below).
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
+
+    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+
+    buf = ward->m_buf.buf;
+#else
+    py::object ward = buffer;
+    PYOPENCL_BUFFER_SIZE_T len;
+    if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len))
+      throw py::error_already_set();
+#endif
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueWriteBufferRect, (
+            cq.data(),
+            mem.data(),
+            PYOPENCL_CAST_BOOL(is_blocking),
+            buffer_origin, host_origin, region,
+            buffer_pitches[0], buffer_pitches[1],
+            host_pitches[0], host_pitches[1],
+            buf,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ))
+      );
+    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
+  }
+
+
+
+
+  // Enqueues clEnqueueCopyBufferRect: copies a rectangular region from 'src'
+  // to 'dst'. Origins, region and pitches are taken from the corresponding
+  // py_* arguments.
+  inline
+  event *enqueue_copy_buffer_rect(
+      command_queue &cq,
+      memory_object_holder &src,
+      memory_object_holder &dst,
+      py::object py_src_origin,
+      py::object py_dst_origin,
+      py::object py_region,
+      py::sequence py_src_pitches,
+      py::sequence py_dst_pitches,
+      py::object py_wait_for)
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    // NOTE(review): each COPY_PY_* macro (defined elsewhere) appears to
+    // convert py_<name> into a local named <name>, used in the call below.
+    COPY_PY_COORD_TRIPLE(src_origin);
+    COPY_PY_COORD_TRIPLE(dst_origin);
+    COPY_PY_REGION_TRIPLE(region);
+    COPY_PY_PITCH_TUPLE(src_pitches);
+    COPY_PY_PITCH_TUPLE(dst_pitches);
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueCopyBufferRect, (
+            cq.data(),
+            src.data(), dst.data(),
+            src_origin, dst_origin, region,
+            src_pitches[0], src_pitches[1],
+            dst_pitches[0], dst_pitches[1],
+            PYOPENCL_WAITLIST_ARGS,
+            &evt
+            ))
+      );
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+#endif
+
+  // }}}
+
+  // }}}
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  // Enqueues clEnqueueFillBuffer: fills 'size' bytes of 'mem', starting at
+  // byte 'offset', with the pattern held in the Python buffer 'pattern'.
+  // The pattern length is the length of that buffer.
+  inline
+  event *enqueue_fill_buffer(
+      command_queue &cq,
+      memory_object_holder &mem,
+      py::object pattern,
+      size_t offset,
+      size_t size,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    const void *pattern_buf;
+    PYOPENCL_BUFFER_SIZE_T pattern_len;
+
+    // Unlike the read/write functions, the returned event is a plain event:
+    // the pattern buffer is only held for the duration of this call.
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
+
+    ward->get(pattern.ptr(), PyBUF_ANY_CONTIGUOUS);
+
+    pattern_buf = ward->m_buf.buf;
+    pattern_len = ward->m_buf.len;
+#else
+    if (PyObject_AsReadBuffer(pattern.ptr(), &pattern_buf, &pattern_len))
+      throw py::error_already_set();
+#endif
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueFillBuffer, (
+            cq.data(),
+            mem.data(),
+            pattern_buf, pattern_len, offset, size,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ))
+      );
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+#endif
+
+  // }}}
+
+
+  // {{{ image
+
+  // Memory object representing an OpenCL image.
+  class image : public memory_object
+  {
+    public:
+      // See memory_object's constructor for the meaning of the arguments.
+      image(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
+        : memory_object(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf))
+      { }
+
+      // Looks up 'param_name' through clGetImageInfo and returns it as a
+      // Python object. Unknown parameter names raise pyopencl::error
+      // (CL_INVALID_VALUE).
+      // NOTE(review): the case groups without an explicit return rely on
+      // PYOPENCL_GET_INTEGRAL_INFO (defined elsewhere) returning from the
+      // function -- confirm.
+      py::object get_image_info(cl_image_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_IMAGE_FORMAT:
+            PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name,
+                cl_image_format);
+          // All of the following are queried as size_t.
+          case CL_IMAGE_ELEMENT_SIZE:
+          case CL_IMAGE_ROW_PITCH:
+          case CL_IMAGE_SLICE_PITCH:
+          case CL_IMAGE_WIDTH:
+          case CL_IMAGE_HEIGHT:
+          case CL_IMAGE_DEPTH:
+#if PYOPENCL_CL_VERSION >= 0x1020
+          case CL_IMAGE_ARRAY_SIZE:
+#endif
+            PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, size_t);
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+          case CL_IMAGE_BUFFER:
+            {
+              cl_mem param_value;
+              PYOPENCL_CALL_GUARDED(clGetImageInfo, \
+                  (data(), param_name, sizeof(param_value), &param_value, 0));
+              if (param_value == 0)
+              {
+                // no associated memory object? no problem.
+                return py::none();
+              }
+
+              // Wrap the handle, taking a reference of our own to it.
+              return create_mem_object_wrapper(param_value, /* retain */ true);
+            }
+
+          case CL_IMAGE_NUM_MIP_LEVELS:
+          case CL_IMAGE_NUM_SAMPLES:
+            PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, cl_uint);
+#endif
+
+          default:
+            throw error("MemoryObject.get_image_info", CL_INVALID_VALUE);
+        }
+      }
+  };
+
+
+
+
+  // {{{ image formats
+
+  // Allocates a cl_image_format on the heap with the given channel order and
+  // channel data type. The caller takes ownership of the returned pointer.
+  inline
+  cl_image_format *make_image_format(cl_channel_order ord, cl_channel_type tp)
+  {
+    // Plain field assignments cannot throw, so no guard object is needed
+    // between the allocation and the return.
+    cl_image_format *fmt = new cl_image_format;
+    fmt->image_channel_order = ord;
+    fmt->image_channel_data_type = tp;
+    return fmt;
+  }
+
+  // Returns the image formats that 'ctx' supports for the given memory
+  // 'flags' and 'image_type', as a Python list of cl_image_format.
+  inline
+  py::list get_supported_image_formats(
+      context const &ctx,
+      cl_mem_flags flags,
+      cl_mem_object_type image_type)
+  {
+    // First call: only ask how many formats there are.
+    cl_uint format_count;
+    PYOPENCL_CALL_GUARDED(clGetSupportedImageFormats, (
+          ctx.data(), flags, image_type,
+          0, nullptr, &format_count));
+
+    // Second call: fetch exactly that many formats.
+    std::vector<cl_image_format> supported_formats(format_count);
+    cl_image_format *out = nullptr;
+    if (!supported_formats.empty())
+      out = &supported_formats.front();
+
+    PYOPENCL_CALL_GUARDED(clGetSupportedImageFormats, (
+          ctx.data(), flags, image_type,
+          supported_formats.size(), out, nullptr));
+
+    PYOPENCL_RETURN_VECTOR(cl_image_format, supported_formats);
+  }
+
+  // Returns the number of channels implied by fmt.image_channel_order.
+  // Raises pyopencl::error (CL_INVALID_VALUE) for channel orders not listed
+  // below.
+  inline
+  cl_uint get_image_format_channel_count(cl_image_format const &fmt)
+  {
+    switch (fmt.image_channel_order)
+    {
+      case CL_R: return 1;
+      case CL_A: return 1;
+      case CL_RG: return 2;
+      case CL_RA: return 2;
+      case CL_RGB: return 3;
+      case CL_RGBA: return 4;
+      case CL_BGRA: return 4;
+      case CL_INTENSITY: return 1;
+      case CL_LUMINANCE: return 1;
+      default:
+        // Report this function's own location rather than that of
+        // the channel data type lookup.
+        throw pyopencl::error("ImageFormat.channel_count",
+            CL_INVALID_VALUE,
+            "unrecognized channel order");
+    }
+  }
+
+  // Returns the size in bytes associated with fmt.image_channel_data_type.
+  // Raises pyopencl::error (CL_INVALID_VALUE) for data types not listed
+  // below.
+  inline
+  cl_uint get_image_format_channel_dtype_size(cl_image_format const &fmt)
+  {
+    switch (fmt.image_channel_data_type)
+    {
+      // one byte
+      case CL_SNORM_INT8:
+      case CL_UNORM_INT8:
+      case CL_SIGNED_INT8:
+      case CL_UNSIGNED_INT8:
+        return 1;
+
+      // two bytes
+      case CL_SNORM_INT16:
+      case CL_UNORM_INT16:
+      case CL_UNORM_SHORT_565:
+      case CL_UNORM_SHORT_555:
+      case CL_SIGNED_INT16:
+      case CL_UNSIGNED_INT16:
+      case CL_HALF_FLOAT:
+        return 2;
+
+      // four bytes
+      case CL_UNORM_INT_101010:
+      case CL_SIGNED_INT32:
+      case CL_UNSIGNED_INT32:
+      case CL_FLOAT:
+        return 4;
+
+      default:
+        throw pyopencl::error("ImageFormat.channel_dtype_size",
+            CL_INVALID_VALUE,
+            "unrecognized channel data type");
+    }
+  }
+
+  // Size of one image element for 'fmt': channel count multiplied by the
+  // per-channel data type size.
+  inline
+  cl_uint get_image_format_item_size(cl_image_format const &fmt)
+  {
+    const cl_uint channel_count = get_image_format_channel_count(fmt);
+    const cl_uint bytes_per_channel = get_image_format_channel_dtype_size(fmt);
+    return channel_count * bytes_per_channel;
+  }
+
+  // }}}
+
+  // {{{ image creation
+
+  // Creates a 2D or 3D image in 'ctx' (clCreateImage2D / clCreateImage3D,
+  // chosen by the length of 'shape') and wraps it in a heap-allocated
+  // 'image'.
+  //
+  // 'shape' is (width, height) or (width, height, depth). 'pitches' may be
+  // None, or hold one entry (2D) or two entries (3D). 'buffer' may be None
+  // or an object exposing a buffer; when given, its memory is passed as the
+  // host pointer and must be large enough for the requested image. The host
+  // buffer is kept with the returned object only when CL_MEM_USE_HOST_PTR
+  // is set. Invalid arguments raise pyopencl::error (CL_INVALID_VALUE).
+  inline
+  image *create_image(
+      context const &ctx,
+      cl_mem_flags flags,
+      cl_image_format const &fmt,
+      py::sequence shape,
+      py::sequence pitches,
+      py::object buffer)
+  {
+    if (shape.ptr() == Py_None)
+      throw pyopencl::error("Image", CL_INVALID_VALUE,
+          "'shape' must be given");
+
+    void *buf = 0;
+    PYOPENCL_BUFFER_SIZE_T len = 0;
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
+    if (buffer.ptr() != Py_None)
+    {
+      retained_buf_obj = std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper);
+
+      // Only demand a writable buffer if the device may write into host
+      // memory directly.
+      int py_buf_flags = PyBUF_ANY_CONTIGUOUS;
+      if ((flags & CL_MEM_USE_HOST_PTR)
+          && ((flags & CL_MEM_READ_WRITE)
+            || (flags & CL_MEM_WRITE_ONLY)))
+        py_buf_flags |= PyBUF_WRITABLE;
+
+      retained_buf_obj->get(buffer.ptr(), py_buf_flags);
+
+      buf = retained_buf_obj->m_buf.buf;
+      len = retained_buf_obj->m_buf.len;
+    }
+#else
+    py::object retained_buf_obj;
+    if (buffer.ptr() != Py_None)
+    {
+      // Only demand a writable buffer if the device may write into host
+      // memory directly.
+      if ((flags & CL_MEM_USE_HOST_PTR)
+          && ((flags & CL_MEM_READ_WRITE)
+            || (flags & CL_MEM_WRITE_ONLY)))
+      {
+        if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
+          throw py::error_already_set();
+      }
+      else
+      {
+        if (PyObject_AsReadBuffer(
+              buffer.ptr(), const_cast<const void **>(&buf), &len))
+          throw py::error_already_set();
+      }
+
+      if (flags & CL_MEM_USE_HOST_PTR)
+        retained_buf_obj = buffer;
+    }
+#endif
+
+    unsigned dims = py::len(shape);
+    cl_int status_code;
+    cl_mem mem;
+    if (dims == 2)
+    {
+      size_t width = (shape[0]).cast<size_t>();
+      size_t height = (shape[1]).cast<size_t>();
+
+      size_t pitch = 0;
+      if (pitches.ptr() != Py_None)
+      {
+        if (py::len(pitches) != 1)
+          throw pyopencl::error("Image", CL_INVALID_VALUE,
+              "invalid length of pitch tuple");
+        pitch = (pitches[0]).cast<size_t>();
+      }
+
+      // check buffer size
+      // Compare in size_t: narrowing the host buffer length to cl_uint
+      // would truncate it to 32 bits.
+      size_t itemsize = get_image_format_item_size(fmt);
+      if (buf && std::max(pitch, width*itemsize)*height > size_t(len))
+          throw pyopencl::error("Image", CL_INVALID_VALUE,
+              "buffer too small");
+
+      PYOPENCL_PRINT_CALL_TRACE("clCreateImage2D");
+      PYOPENCL_RETRY_IF_MEM_ERROR(
+          {
+            mem = clCreateImage2D(ctx.data(), flags, &fmt,
+                width, height, pitch, buf, &status_code);
+            if (status_code != CL_SUCCESS)
+              throw pyopencl::error("clCreateImage2D", status_code);
+          } );
+
+    }
+    else if (dims == 3)
+    {
+      size_t width = (shape[0]).cast<size_t>();
+      size_t height = (shape[1]).cast<size_t>();
+      size_t depth = (shape[2]).cast<size_t>();
+
+      size_t pitch_x = 0;
+      size_t pitch_y = 0;
+
+      if (pitches.ptr() != Py_None)
+      {
+        if (py::len(pitches) != 2)
+          throw pyopencl::error("Image", CL_INVALID_VALUE,
+              "invalid length of pitch tuple");
+
+        pitch_x = (pitches[0]).cast<size_t>();
+        pitch_y = (pitches[1]).cast<size_t>();
+      }
+
+      // check buffer size
+      // Compare in size_t: narrowing the host buffer length to cl_uint
+      // would truncate it to 32 bits.
+      size_t itemsize = get_image_format_item_size(fmt);
+      if (buf &&
+          std::max(std::max(pitch_x, width*itemsize)*height, pitch_y)
+          * depth > size_t(len))
+        throw pyopencl::error("Image", CL_INVALID_VALUE,
+            "buffer too small");
+
+      PYOPENCL_PRINT_CALL_TRACE("clCreateImage3D");
+      PYOPENCL_RETRY_IF_MEM_ERROR(
+          {
+            mem = clCreateImage3D(ctx.data(), flags, &fmt,
+              width, height, depth, pitch_x, pitch_y, buf, &status_code);
+            if (status_code != CL_SUCCESS)
+              throw pyopencl::error("clCreateImage3D", status_code);
+          } );
+    }
+    else
+      throw pyopencl::error("Image", CL_INVALID_VALUE,
+          "invalid dimension");
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    // Without CL_MEM_USE_HOST_PTR the host buffer is not kept once the
+    // image has been created.
+    if (!(flags & CL_MEM_USE_HOST_PTR))
+      retained_buf_obj.reset();
+#endif
+
+    // The wrapper adopts the reference we just got (retain=false); if
+    // wrapping fails, give that reference back.
+    try
+    {
+      return new image(mem, false, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(retained_buf_obj));
+    }
+    catch (...)
+    {
+      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
+      throw;
+    }
+  }
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+
+  // Creates an image in 'ctx' from an image descriptor (clCreateImage) and
+  // wraps it in a heap-allocated 'image'.
+  //
+  // 'buffer' may be None or an object exposing a buffer; when given, its
+  // memory is passed as the host pointer. The host buffer is kept with the
+  // returned object only when CL_MEM_USE_HOST_PTR is set.
+  inline
+  image *create_image_from_desc(
+      context const &ctx,
+      cl_mem_flags flags,
+      cl_image_format const &fmt,
+      cl_image_desc &desc,
+      py::object buffer)
+  {
+    if (buffer.ptr() != Py_None &&
+        !(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
+    {
+      // PyErr_Warn returns nonzero with a Python exception set when the
+      // warning has been escalated to an error. Propagate that instead of
+      // carrying on with an exception pending.
+      if (PyErr_Warn(PyExc_UserWarning, "'hostbuf' was passed, "
+          "but no memory flags to make use of it.") != 0)
+        throw py::error_already_set();
+    }
+
+    void *buf = 0;
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
+    if (buffer.ptr() != Py_None)
+    {
+      retained_buf_obj = std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper);
+
+      // Only demand a writable buffer if the device may write into host
+      // memory directly.
+      int py_buf_flags = PyBUF_ANY_CONTIGUOUS;
+      if ((flags & CL_MEM_USE_HOST_PTR)
+          && ((flags & CL_MEM_READ_WRITE)
+            || (flags & CL_MEM_WRITE_ONLY)))
+        py_buf_flags |= PyBUF_WRITABLE;
+
+      retained_buf_obj->get(buffer.ptr(), py_buf_flags);
+
+      buf = retained_buf_obj->m_buf.buf;
+    }
+#else
+    py::object retained_buf_obj;
+    PYOPENCL_BUFFER_SIZE_T len;
+    if (buffer.ptr() != Py_None)
+    {
+      // Only demand a writable buffer if the device may write into host
+      // memory directly.
+      if ((flags & CL_MEM_USE_HOST_PTR)
+          && ((flags & CL_MEM_READ_WRITE)
+            || (flags & CL_MEM_WRITE_ONLY)))
+      {
+        if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
+          throw py::error_already_set();
+      }
+      else
+      {
+        if (PyObject_AsReadBuffer(
+              buffer.ptr(), const_cast<const void **>(&buf), &len))
+          throw py::error_already_set();
+      }
+
+      if (flags & CL_MEM_USE_HOST_PTR)
+        retained_buf_obj = buffer;
+    }
+#endif
+
+    PYOPENCL_PRINT_CALL_TRACE("clCreateImage");
+    cl_int status_code;
+    cl_mem mem = clCreateImage(ctx.data(), flags, &fmt, &desc, buf, &status_code);
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clCreateImage", status_code);
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    // Without CL_MEM_USE_HOST_PTR the host buffer is not kept once the
+    // image has been created.
+    if (!(flags & CL_MEM_USE_HOST_PTR))
+      retained_buf_obj.reset();
+#endif
+
+    // The wrapper adopts the reference we just got (retain=false); if
+    // wrapping fails, give that reference back.
+    try
+    {
+      return new image(mem, false, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(retained_buf_obj));
+    }
+    catch (...)
+    {
+      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
+      throw;
+    }
+  }
+
+#endif
+
+  // }}}
+
+  // {{{ image transfers
+
+  // Enqueues clEnqueueReadImage: reads the region of 'img' described by
+  // 'py_origin' / 'py_region' into the writable Python buffer 'buffer',
+  // using the given row and slice pitches.
+  inline
+  event *enqueue_read_image(
+      command_queue &cq,
+      image &img,
+      py::object py_origin, py::object py_region,
+      py::object buffer,
+      size_t row_pitch, size_t slice_pitch,
+      py::object py_wait_for,
+      bool is_blocking)
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    // NOTE(review): each COPY_PY_* macro (defined elsewhere) appears to
+    // convert py_<name> into a local named <name>, used in the call below.
+    COPY_PY_COORD_TRIPLE(origin);
+    COPY_PY_REGION_TRIPLE(region);
+
+    void *buf;
+
+    // 'ward' is what gets attached to the returned event (see the
+    // PYOPENCL_RETURN_NEW_NANNY_EVENT call below).
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
+
+    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
+
+    buf = ward->m_buf.buf;
+#else
+    py::object ward = buffer;
+    PYOPENCL_BUFFER_SIZE_T len;
+    if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len))
+      throw py::error_already_set();
+#endif
+
+    cl_event evt;
+
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueReadImage, (
+            cq.data(),
+            img.data(),
+            PYOPENCL_CAST_BOOL(is_blocking),
+            origin, region, row_pitch, slice_pitch, buf,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ));
+      );
+    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
+  }
+
+
+
+
+  // Enqueues a write of host memory exposed by the Python object `buffer`
+  // into a region of image `img`, using clEnqueueWriteImage.
+  //
+  // cq:           queue the write is enqueued on
+  // img:          image to write to
+  // py_origin, py_region: Python-side origin/region; the COPY_PY_*_TRIPLE
+  //               macros provide the `origin`/`region` locals passed to
+  //               OpenCL (presumably filled from these arguments)
+  // buffer:       source object; only a readable buffer is required
+  // row_pitch, slice_pitch: forwarded unchanged to clEnqueueWriteImage
+  // py_wait_for:  events to wait for (presumably parsed by
+  //               PYOPENCL_PARSE_WAIT_FOR into PYOPENCL_WAITLIST_ARGS)
+  // is_blocking:  forwarded as the blocking flag
+  //
+  // The result comes from PYOPENCL_RETURN_NEW_NANNY_EVENT, which is handed
+  // `ward` -- presumably so the host buffer outlives a non-blocking transfer.
+  // Throws py::error_already_set if the legacy buffer lookup fails.
+  inline
+  event *enqueue_write_image(
+      command_queue &cq,
+      image &img,
+      py::object py_origin, py::object py_region,
+      py::object buffer,
+      size_t row_pitch, size_t slice_pitch,
+      py::object py_wait_for,
+      bool is_blocking)
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    COPY_PY_COORD_TRIPLE(origin);
+    COPY_PY_REGION_TRIPLE(region);
+
+    // Host source pointer; read-only access is sufficient for a write.
+    const void *buf;
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
+
+    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+
+    buf = ward->m_buf.buf;
+#else
+    // Legacy buffer protocol: the Python object itself acts as the ward.
+    py::object ward = buffer;
+    PYOPENCL_BUFFER_SIZE_T len;
+    if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len))
+      throw py::error_already_set();
+#endif
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueWriteImage, (
+            cq.data(),
+            img.data(),
+            PYOPENCL_CAST_BOOL(is_blocking),
+            origin, region, row_pitch, slice_pitch, buf,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ));
+      );
+    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
+  }
+
+
+
+
+  // Enqueues a device-side image-to-image copy using clEnqueueCopyImage.
+  //
+  // cq:             queue the copy is enqueued on
+  // src, dest:      source and destination memory objects
+  // py_src_origin, py_dest_origin, py_region: Python-side coordinates; the
+  //                 COPY_PY_*_TRIPLE macros provide the `src_origin`,
+  //                 `dest_origin` and `region` locals passed to OpenCL
+  //                 (presumably filled from these arguments)
+  // py_wait_for:    events to wait for (presumably parsed by
+  //                 PYOPENCL_PARSE_WAIT_FOR)
+  //
+  // The result is produced by PYOPENCL_RETURN_NEW_EVENT from the cl_event
+  // that OpenCL hands back.
+  inline
+  event *enqueue_copy_image(
+      command_queue &cq,
+      memory_object_holder &src,
+      memory_object_holder &dest,
+      py::object py_src_origin,
+      py::object py_dest_origin,
+      py::object py_region,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    COPY_PY_COORD_TRIPLE(src_origin);
+    COPY_PY_COORD_TRIPLE(dest_origin);
+    COPY_PY_REGION_TRIPLE(region);
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueCopyImage, (
+            cq.data(), src.data(), dest.data(),
+            src_origin, dest_origin, region,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ));
+      );
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+
+
+
+  // Enqueues a device-side copy from an image region into a buffer using
+  // clEnqueueCopyImageToBuffer.
+  //
+  // cq:           queue the copy is enqueued on
+  // src:          source memory object (passed as the image argument)
+  // dest:         destination memory object (passed as the buffer argument)
+  // py_origin, py_region: Python-side image coordinates; the
+  //               COPY_PY_*_TRIPLE macros provide the `origin`/`region`
+  //               locals (presumably filled from these arguments)
+  // offset:       forwarded unchanged as the destination offset
+  // py_wait_for:  events to wait for (presumably parsed by
+  //               PYOPENCL_PARSE_WAIT_FOR)
+  //
+  // The result is produced by PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_copy_image_to_buffer(
+      command_queue &cq,
+      memory_object_holder &src,
+      memory_object_holder &dest,
+      py::object py_origin,
+      py::object py_region,
+      size_t offset,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    COPY_PY_COORD_TRIPLE(origin);
+    COPY_PY_REGION_TRIPLE(region);
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueCopyImageToBuffer, (
+            cq.data(), src.data(), dest.data(),
+            origin, region, offset,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ));
+      );
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+
+
+
+  // Enqueues a device-side copy from a buffer into an image region using
+  // clEnqueueCopyBufferToImage.
+  //
+  // cq:           queue the copy is enqueued on
+  // src:          source memory object (passed as the buffer argument)
+  // dest:         destination memory object (passed as the image argument)
+  // offset:       forwarded unchanged as the source offset
+  // py_origin, py_region: Python-side image coordinates; the
+  //               COPY_PY_*_TRIPLE macros provide the `origin`/`region`
+  //               locals (presumably filled from these arguments)
+  // py_wait_for:  events to wait for (presumably parsed by
+  //               PYOPENCL_PARSE_WAIT_FOR)
+  //
+  // The result is produced by PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_copy_buffer_to_image(
+      command_queue &cq,
+      memory_object_holder &src,
+      memory_object_holder &dest,
+      size_t offset,
+      py::object py_origin,
+      py::object py_region,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    COPY_PY_COORD_TRIPLE(origin);
+    COPY_PY_REGION_TRIPLE(region);
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueCopyBufferToImage, (
+            cq.data(), src.data(), dest.data(),
+            offset, origin, region,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ));
+      );
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+  // }}}
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  // Enqueues a fill of an image region with a constant color using
+  // clEnqueueFillImage.
+  //
+  // cq:           queue the fill is enqueued on
+  // mem:          memory object to fill
+  // color:        Python object exposing a readable buffer; its start
+  //               address is passed as the fill color.  The buffer length
+  //               is not checked here.
+  // py_origin, py_region: Python-side coordinates; the COPY_PY_*_TRIPLE
+  //               macros provide the `origin`/`region` locals (presumably
+  //               filled from these arguments)
+  // py_wait_for:  events to wait for (presumably parsed by
+  //               PYOPENCL_PARSE_WAIT_FOR)
+  //
+  // The result is produced by PYOPENCL_RETURN_NEW_EVENT; the color buffer
+  // is not attached to the event.
+  // Throws py::error_already_set if the legacy buffer lookup fails.
+  inline
+  event *enqueue_fill_image(
+      command_queue &cq,
+      memory_object_holder &mem,
+      py::object color,
+      py::object py_origin, py::object py_region,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    COPY_PY_COORD_TRIPLE(origin);
+    COPY_PY_REGION_TRIPLE(region);
+
+    const void *color_buf;
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    // `ward` is a local, so the buffer view lives only until this function
+    // returns.
+    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);
+
+    ward->get(color.ptr(), PyBUF_ANY_CONTIGUOUS);
+
+    color_buf = ward->m_buf.buf;
+#else
+    PYOPENCL_BUFFER_SIZE_T color_len;
+    if (PyObject_AsReadBuffer(color.ptr(), &color_buf, &color_len))
+      throw py::error_already_set();
+#endif
+
+    cl_event evt;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED(clEnqueueFillImage, (
+            cq.data(),
+            mem.data(),
+            color_buf, origin, region,
+            PYOPENCL_WAITLIST_ARGS, &evt
+            ));
+      );
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+#endif
+
+  // }}}
+
+
+  // {{{ maps
+  // Tracks one mapped region of a memory object: remembers the queue, the
+  // memory object and the mapped host pointer, and unmaps the region either
+  // on an explicit release() or, failing that, on destruction.
+  class memory_map
+  {
+    private:
+      // True until release() has enqueued the unmap.
+      bool m_valid;
+      std::shared_ptr<command_queue> m_queue;
+      memory_object m_mem;
+      void *m_ptr;
+
+    public:
+      memory_map(std::shared_ptr<command_queue> cq, memory_object const &mem, void *ptr)
+        : m_valid(true),
+          m_queue(cq),
+          m_mem(mem),
+          m_ptr(ptr)
+      { }
+
+      // Unmaps on the stored queue if nobody released the mapping, and
+      // discards the resulting event.
+      ~memory_map()
+      {
+        if (!m_valid)
+          return;
+
+        delete release(nullptr, py::none());
+      }
+
+      // Enqueues clEnqueueUnmapMemObject for the stored pointer.
+      //
+      // cq:          queue to unmap on; a null pointer selects the queue
+      //              given at construction
+      // py_wait_for: events to wait for
+      //
+      // Marks the map as no longer valid and hands back the unmap event.
+      event *release(command_queue *cq, py::object py_wait_for)
+      {
+        PYOPENCL_PARSE_WAIT_FOR;
+
+        command_queue *unmap_queue = cq ? cq : m_queue.get();
+
+        cl_event evt;
+        PYOPENCL_CALL_GUARDED(clEnqueueUnmapMemObject, (
+              unmap_queue->data(), m_mem.data(), m_ptr,
+              PYOPENCL_WAITLIST_ARGS, &evt
+              ));
+
+        m_valid = false;
+
+        PYOPENCL_RETURN_NEW_EVENT(evt);
+      }
+  };
+
+
+
+
+  // FIXME: Reenable in pypy
+#ifndef PYPY_VERSION
+  // Maps a range of a buffer into host memory with clEnqueueMapBuffer and
+  // exposes it as a NumPy array.
+  //
+  // cq:           queue the map is enqueued on (shared so the resulting
+  //               memory_map can unmap on it later)
+  // buf:          memory object to map
+  // flags:        forwarded unchanged as the map flags
+  // offset:       byte offset of the mapped range
+  // py_shape, dtype, py_order, py_strides: NumPy array specification;
+  //               PYOPENCL_PARSE_NUMPY_ARRAY_SPEC provides the `tp_descr`,
+  //               `shape`, `strides` and `ary_flags` locals used below
+  //               (presumably parsed from these arguments)
+  // py_wait_for:  events to wait for
+  // is_blocking:  forwarded as the blocking flag
+  //
+  // Returns a tuple (array, event).  The array's base object is a
+  // memory_map, so the region is unmapped once the array is destroyed.
+  //
+  // Throws pyopencl::error if the map fails or if the requested array does
+  // not cover exactly the mapped byte count, and py::error_already_set if
+  // NumPy cannot create the array.  In the latter two cases the region is
+  // unmapped again before the exception propagates.
+  inline
+  py::object enqueue_map_buffer(
+      std::shared_ptr<command_queue> cq,
+      memory_object_holder &buf,
+      cl_map_flags flags,
+      size_t offset,
+      py::object py_shape, py::object dtype,
+      py::object py_order, py::object py_strides,
+      py::object py_wait_for,
+      bool is_blocking
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    PYOPENCL_PARSE_NUMPY_ARRAY_SPEC;
+
+    // Number of bytes to map: element size times the product of the shape.
+    npy_uintp size_in_bytes = tp_descr->elsize;
+    for (npy_intp sdim: shape)
+      size_in_bytes *= sdim;
+
+    py::object result;
+
+    cl_event evt;
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clEnqueueMapBuffer");
+    void *mapped;
+
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+        {
+          {
+            // The map may block, so the GIL is dropped around the call.
+            py::gil_scoped_release release;
+            mapped = clEnqueueMapBuffer(
+                  cq->data(), buf.data(),
+                  PYOPENCL_CAST_BOOL(is_blocking), flags,
+                  offset, size_in_bytes,
+                  PYOPENCL_WAITLIST_ARGS, &evt,
+                  &status_code);
+          }
+          if (status_code != CL_SUCCESS)
+            throw pyopencl::error("clEnqueueMapBuffer", status_code);
+        } );
+
+    event evt_handle(evt, false);
+
+    std::unique_ptr<memory_map> map;
+    try
+    {
+      result = py::reinterpret_steal<py::object>(PyArray_NewFromDescr(
+          &PyArray_Type, tp_descr,
+          shape.size(),
+          shape.empty() ? nullptr : &shape.front(),
+          strides.empty() ? nullptr : &strides.front(),
+          mapped, ary_flags, /*obj*/nullptr));
+
+      // PyArray_NewFromDescr returns NULL with a Python error set on
+      // failure; dereferencing that below would crash the interpreter.
+      if (result.ptr() == nullptr)
+        throw py::error_already_set();
+
+      if (size_in_bytes != (npy_uintp) PyArray_NBYTES(result.ptr()))
+        throw pyopencl::error("enqueue_map_buffer", CL_INVALID_VALUE,
+            "miscalculated numpy array size (not contiguous?)");
+
+       map = std::unique_ptr<memory_map>(new memory_map(cq, buf, mapped));
+    }
+    catch (...)
+    {
+      // No memory_map owns the region yet, so undo the mapping by hand.
+      PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueUnmapMemObject, (
+            cq->data(), buf.data(), mapped, 0, 0, 0));
+      throw;
+    }
+
+    // Install the memory_map as the array's base object.  The extra INCREF
+    // accounts for the reference now stored in the array, which map_py's
+    // own destructor does not cover.
+    py::object map_py(handle_from_new_ptr(map.release()));
+    PyArray_BASE(result.ptr()) = map_py.ptr();
+    Py_INCREF(map_py.ptr());
+
+    return py::make_tuple(
+        result,
+        handle_from_new_ptr(new event(evt_handle)));
+  }
+#endif
+
+
+
+
+  // FIXME: Reenable in pypy
+#ifndef PYPY_VERSION
+  // Maps a region of an image into host memory with clEnqueueMapImage and
+  // exposes it as a NumPy array.
+  //
+  // cq:           queue the map is enqueued on (shared so the resulting
+  //               memory_map can unmap on it later)
+  // img:          memory object to map
+  // flags:        forwarded unchanged as the map flags
+  // py_origin, py_region: Python-side coordinates; the COPY_PY_*_TRIPLE
+  //               macros provide the `origin`/`region` locals (presumably
+  //               filled from these arguments)
+  // py_shape, dtype, py_order, py_strides: NumPy array specification;
+  //               PYOPENCL_PARSE_NUMPY_ARRAY_SPEC provides the `tp_descr`,
+  //               `shape`, `strides` and `ary_flags` locals used below
+  //               (presumably parsed from these arguments)
+  // py_wait_for:  events to wait for
+  // is_blocking:  forwarded as the blocking flag
+  //
+  // Returns a tuple (array, event, row_pitch, slice_pitch).  The array's
+  // base object is a memory_map, so the region is unmapped once the array
+  // is destroyed.
+  //
+  // Throws pyopencl::error if the map fails and py::error_already_set if
+  // NumPy cannot create the array; in the latter case the local memory_map
+  // unmaps the region while the stack unwinds.
+  inline
+  py::object enqueue_map_image(
+      std::shared_ptr<command_queue> cq,
+      memory_object_holder &img,
+      cl_map_flags flags,
+      py::object py_origin,
+      py::object py_region,
+      py::object py_shape, py::object dtype,
+      py::object py_order, py::object py_strides,
+      py::object py_wait_for,
+      bool is_blocking
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+    PYOPENCL_PARSE_NUMPY_ARRAY_SPEC;
+    COPY_PY_COORD_TRIPLE(origin);
+    COPY_PY_REGION_TRIPLE(region);
+
+    cl_event evt;
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clEnqueueMapImage");
+    size_t row_pitch, slice_pitch;
+    void *mapped;
+    PYOPENCL_RETRY_IF_MEM_ERROR(
+      {
+        {
+          // The map may block, so the GIL is dropped around the call.
+          py::gil_scoped_release release;
+          mapped = clEnqueueMapImage(
+                cq->data(), img.data(),
+                PYOPENCL_CAST_BOOL(is_blocking), flags,
+                origin, region, &row_pitch, &slice_pitch,
+                PYOPENCL_WAITLIST_ARGS, &evt,
+                &status_code);
+        }
+        if (status_code != CL_SUCCESS)
+          throw pyopencl::error("clEnqueueMapImage", status_code);
+      } );
+
+    event evt_handle(evt, false);
+
+    std::unique_ptr<memory_map> map;
+    try
+    {
+       map = std::unique_ptr<memory_map>(new memory_map(cq, img, mapped));
+    }
+    catch (...)
+    {
+      // No memory_map owns the region yet, so undo the mapping by hand.
+      PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueUnmapMemObject, (
+            cq->data(), img.data(), mapped, 0, 0, 0));
+      throw;
+    }
+
+    py::object result = py::reinterpret_steal<py::object>(PyArray_NewFromDescr(
+        &PyArray_Type, tp_descr,
+        shape.size(),
+        shape.empty() ? nullptr : &shape.front(),
+        strides.empty() ? nullptr : &strides.front(),
+        mapped, ary_flags, /*obj*/nullptr));
+
+    // PyArray_NewFromDescr returns NULL with a Python error set on failure;
+    // writing the base pointer of a null array below would crash.
+    if (result.ptr() == nullptr)
+      throw py::error_already_set();
+
+    // Install the memory_map as the array's base object.  The extra INCREF
+    // accounts for the reference now stored in the array, which map_py's
+    // own destructor does not cover.
+    py::object map_py(handle_from_new_ptr(map.release()));
+    PyArray_BASE(result.ptr()) = map_py.ptr();
+    Py_INCREF(map_py.ptr());
+
+    return py::make_tuple(
+        result,
+        handle_from_new_ptr(new event(evt_handle)),
+        row_pitch, slice_pitch);
+  }
+#endif
+
+  // }}}
+
+
+  // {{{ svm
+
+#if PYOPENCL_CL_VERSION >= 0x2000
+
+  // Captures the address and byte length of the buffer exposed by a Python
+  // object so it can be passed to the SVM entry points.
+  class svm_arg_wrapper
+  {
+    private:
+      void *m_ptr;
+      PYOPENCL_BUFFER_SIZE_T m_size;
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+      // Holds the buffer view for the lifetime of this wrapper.
+      std::unique_ptr<py_buffer_wrapper> ward;
+#endif
+
+    public:
+      // holder: Python object exposing a contiguous buffer (writable,
+      //         except on PyPy with the new buffer interface).
+      // Throws py::error_already_set if the legacy buffer lookup fails.
+      svm_arg_wrapper(py::object holder)
+      {
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+#ifdef PYPY_VERSION
+        // FIXME: get a read-only buffer
+        // Not quite honest, but Pypy doesn't consider numpy arrays
+        // created from objects with the __array_interface__ writeable.
+        const int request_flags = PyBUF_ANY_CONTIGUOUS;
+#else
+        const int request_flags = PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE;
+#endif
+        ward.reset(new py_buffer_wrapper);
+        ward->get(holder.ptr(), request_flags);
+
+        m_ptr = ward->m_buf.buf;
+        m_size = ward->m_buf.len;
+#else
+        py::object ward = holder;
+        if (PyObject_AsWriteBuffer(holder.ptr(), &m_ptr, &m_size) != 0)
+          throw py::error_already_set();
+#endif
+      }
+
+      // Start address of the wrapped buffer.
+      void *ptr() const
+      { return m_ptr; }
+
+      // Length of the wrapped buffer in bytes.
+      size_t size() const
+      { return m_size; }
+  };
+
+
+  // Owns one shared-virtual-memory allocation obtained from clSVMAlloc and
+  // holds a reference to its context for as long as this object exists.
+  class svm_allocation : noncopyable
+  {
+    private:
+      std::shared_ptr<context> m_context;
+      // Null once the allocation has been freed or handed to
+      // clEnqueueSVMFree.
+      void *m_allocation;
+
+    public:
+      // Allocates `size` bytes of SVM in `ctx` with the given alignment and
+      // flags.  Throws pyopencl::error (CL_OUT_OF_RESOURCES) if clSVMAlloc
+      // returns a null pointer.
+      svm_allocation(std::shared_ptr<context> const &ctx, size_t size, cl_uint alignment, cl_svm_mem_flags flags)
+        : m_context(ctx)
+      {
+        PYOPENCL_PRINT_CALL_TRACE("clSVMAlloc");
+        m_allocation = clSVMAlloc(
+            ctx->data(),
+            flags, size, alignment);
+
+        if (!m_allocation)
+          throw pyopencl::error("clSVMAlloc", CL_OUT_OF_RESOURCES);
+      }
+
+      // Frees the allocation unless it has already been released.
+      ~svm_allocation()
+      {
+        if (m_allocation)
+          release();
+      }
+
+      // Frees the allocation immediately with clSVMFree.
+      // Throws if the allocation was already released.
+      void release()
+      {
+        if (!m_allocation)
+          throw error("SVMAllocation.release", CL_INVALID_VALUE,
+              "trying to double-unref svm allocation");
+
+        clSVMFree(m_context->data(), m_allocation);
+        m_allocation = nullptr;
+      }
+
+      // Enqueues the free on `queue` with clEnqueueSVMFree, after the
+      // events in `py_wait_for`.  Throws if the allocation was already
+      // released.
+      //
+      // NOTE(review): PYOPENCL_CALL_GUARDED_CLEANUP is used here, so an
+      // enqueue failure is presumably not raised to the caller, and the
+      // pointer is forgotten either way -- confirm this is intended.
+      void enqueue_release(command_queue &queue, py::object py_wait_for)
+      {
+        PYOPENCL_PARSE_WAIT_FOR;
+
+        if (!m_allocation)
+          throw error("SVMAllocation.enqueue_release", CL_INVALID_VALUE,
+              "trying to double-unref svm allocation");
+
+        // This function returns nothing, so no completion event is
+        // requested: an event obtained here could never be released.
+        PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, (
+              queue.data(), 1, &m_allocation,
+              nullptr, nullptr,
+              PYOPENCL_WAITLIST_ARGS, nullptr));
+
+        m_allocation = nullptr;
+      }
+
+      // Raw SVM pointer (null after release).
+      void *ptr() const
+      {
+        return m_allocation;
+      }
+
+      // The SVM pointer as an integer.
+      intptr_t ptr_as_int() const
+      {
+        return (intptr_t) m_allocation;
+      }
+
+      // Two allocations compare equal when they hold the same pointer.
+      bool operator==(svm_allocation const &other) const
+      {
+        return m_allocation == other.m_allocation;
+      }
+
+      bool operator!=(svm_allocation const &other) const
+      {
+        return m_allocation != other.m_allocation;
+      }
+  };
+
+
+  // Enqueues a copy between two SVM regions using clEnqueueSVMMemcpy.
+  //
+  // cq:           queue the copy is enqueued on
+  // is_blocking:  forwarded as the blocking flag
+  // dst, src:     destination and source regions; their sizes must match
+  // py_wait_for:  events to wait for (presumably parsed by
+  //               PYOPENCL_PARSE_WAIT_FOR)
+  //
+  // Throws error (CL_INVALID_VALUE) when the two sizes differ.  The result
+  // is produced by PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_svm_memcpy(
+      command_queue &cq,
+      cl_bool is_blocking,
+      svm_arg_wrapper &dst, svm_arg_wrapper &src,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    if (src.size() != dst.size())
+      throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
+          "sizes of source and destination buffer do not match");
+
+    cl_event evt;
+    PYOPENCL_CALL_GUARDED(
+        clEnqueueSVMMemcpy,
+        (
+          cq.data(),
+          is_blocking,
+          dst.ptr(), src.ptr(),
+          dst.size(),
+          PYOPENCL_WAITLIST_ARGS,
+          &evt
+        ));
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+
+  // Enqueues a pattern fill of an SVM region using clEnqueueSVMMemFill.
+  //
+  // cq:           queue the fill is enqueued on
+  // dst:          region to fill
+  // py_pattern:   Python object exposing a readable buffer; its address and
+  //               length are passed as the fill pattern and pattern size
+  // byte_count:   number of bytes to fill, or None to fill dst.size() bytes
+  // py_wait_for:  events to wait for (presumably parsed by
+  //               PYOPENCL_PARSE_WAIT_FOR)
+  //
+  // The fill size is not validated here against dst.size() or the pattern
+  // length.  Throws py::error_already_set if the legacy buffer lookup
+  // fails.  The result is produced by PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_svm_memfill(
+      command_queue &cq,
+      svm_arg_wrapper &dst, py::object py_pattern,
+      py::object byte_count,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    const void *pattern_ptr;
+    PYOPENCL_BUFFER_SIZE_T pattern_len;
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    // `pattern_ward` is a local, so the pattern view lives only until this
+    // function returns.
+    std::unique_ptr<py_buffer_wrapper> pattern_ward(new py_buffer_wrapper);
+
+    pattern_ward->get(py_pattern.ptr(), PyBUF_ANY_CONTIGUOUS);
+
+    pattern_ptr = pattern_ward->m_buf.buf;
+    pattern_len = pattern_ward->m_buf.len;
+#else
+    py::object pattern_ward = py_pattern;
+    if (PyObject_AsReadBuffer(py_pattern.ptr(), &pattern_ptr, &pattern_len))
+      throw py::error_already_set();
+#endif
+
+    // Default to filling the whole destination unless a count was given.
+    size_t fill_size = dst.size();
+    if (!byte_count.is_none())
+      fill_size = py::cast<size_t>(byte_count);
+
+    cl_event evt;
+    PYOPENCL_CALL_GUARDED(
+        clEnqueueSVMMemFill,
+        (
+          cq.data(),
+          dst.ptr(), pattern_ptr,
+          pattern_len,
+          fill_size,
+          PYOPENCL_WAITLIST_ARGS,
+          &evt
+        ));
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+
+  // Enqueues a host mapping of an SVM region using clEnqueueSVMMap.
+  //
+  // cq:           queue the map is enqueued on
+  // is_blocking:  forwarded as the blocking flag
+  // flags:        forwarded unchanged as the map flags
+  // svm:          region to map; its full size() is mapped
+  // py_wait_for:  events to wait for (presumably parsed by
+  //               PYOPENCL_PARSE_WAIT_FOR)
+  //
+  // The result is produced by PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_svm_map(
+      command_queue &cq,
+      cl_bool is_blocking,
+      cl_map_flags flags,
+      svm_arg_wrapper &svm,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    cl_event evt;
+    PYOPENCL_CALL_GUARDED(
+        clEnqueueSVMMap,
+        (
+          cq.data(),
+          is_blocking,
+          flags,
+          svm.ptr(), svm.size(),
+          PYOPENCL_WAITLIST_ARGS,
+          &evt
+        ));
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+
+  // Enqueues the unmapping of an SVM region using clEnqueueSVMUnmap.
+  //
+  // cq:           queue the unmap is enqueued on
+  // svm:          region to unmap (identified by its start pointer)
+  // py_wait_for:  events to wait for (presumably parsed by
+  //               PYOPENCL_PARSE_WAIT_FOR)
+  //
+  // The result is produced by PYOPENCL_RETURN_NEW_EVENT.
+  inline
+  event *enqueue_svm_unmap(
+      command_queue &cq,
+      svm_arg_wrapper &svm,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    cl_event evt;
+    PYOPENCL_CALL_GUARDED(
+        clEnqueueSVMUnmap,
+        (
+          cq.data(),
+          svm.ptr(),
+          PYOPENCL_WAITLIST_ARGS,
+          &evt
+        ));
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+#endif
+
+
+#if PYOPENCL_CL_VERSION >= 0x2010
+  // Enqueues a migration of several SVM regions using
+  // clEnqueueSVMMigrateMem.
+  //
+  // cq:           queue the migration is enqueued on
+  // svms:         sequence whose items are cast to svm_arg_wrapper; each
+  //               contributes its pointer and size
+  // flags:        forwarded unchanged as the migration flags
+  // py_wait_for:  events to wait for
+  //
+  // An empty sequence results in null pointer/size arrays being passed.
+  inline
+  event *enqueue_svm_migratemem(
+      command_queue &cq,
+      py::sequence svms,
+      cl_mem_migration_flags flags,
+      py::object py_wait_for
+      )
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    // Parallel arrays: region start addresses and their byte counts.
+    std::vector<const void *> region_ptrs;
+    std::vector<size_t> region_sizes;
+
+    for (py::handle item: svms)
+    {
+      svm_arg_wrapper &wrapped = py::cast<svm_arg_wrapper &>(item);
+
+      region_ptrs.push_back(wrapped.ptr());
+      region_sizes.push_back(wrapped.size());
+    }
+
+    const void **ptr_array =
+      region_ptrs.empty() ? nullptr : &region_ptrs.front();
+    const size_t *size_array =
+      region_sizes.empty() ? nullptr : &region_sizes.front();
+
+    cl_event evt;
+    PYOPENCL_CALL_GUARDED(
+        clEnqueueSVMMigrateMem,
+        (
+         cq.data(),
+         region_ptrs.size(),
+         ptr_array,
+         size_array,
+         flags,
+         PYOPENCL_WAITLIST_ARGS,
+         &evt
+        ));
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+#endif
+
+  // }}}
+
+
+  // {{{ sampler
+
+  // Owns a cl_sampler handle and releases it on destruction.
+  class sampler : noncopyable
+  {
+    private:
+      cl_sampler m_sampler;
+
+    public:
+#if PYOPENCL_CL_VERSION >= 0x2000
+      // Creates a sampler from a flat sequence of sampler properties using
+      // clCreateSamplerWithProperties.  A terminating 0 is appended here,
+      // so callers must not supply it.
+      //
+      // Prints a warning to stderr (and proceeds anyway) when the context's
+      // platform reports a version below OpenCL 2.0.
+      // Throws pyopencl::error if sampler creation fails.
+      sampler(context const &ctx, py::sequence py_props)
+      {
+        int hex_plat_version = ctx.get_hex_platform_version();
+
+        if (hex_plat_version  < 0x2000)
+        {
+          std::cerr <<
+            "sampler properties given as an iterable, "
+            "which uses an OpenCL 2+-only interface, "
+            "but the context's platform does not "
+            "declare OpenCL 2 support. Proceeding "
+            "as requested, but the next thing you see "
+            "may be a crash." << std::endl;
+        }
+
+        // Heap-allocated instead of a variable-length array: VLAs are not
+        // standard C++, and the length comes from the caller.
+        std::vector<cl_sampler_properties> props;
+        props.reserve(py::len(py_props) + 1);
+        for (auto prop: py_props)
+          props.push_back(py::cast<cl_sampler_properties>(prop));
+        props.push_back(0);
+
+        cl_int status_code;
+        PYOPENCL_PRINT_CALL_TRACE("clCreateSamplerWithProperties");
+
+        m_sampler = clCreateSamplerWithProperties(
+            ctx.data(),
+            props.data(),
+            &status_code);
+
+        if (status_code != CL_SUCCESS)
+          throw pyopencl::error("Sampler", status_code);
+      }
+#endif
+
+      // Creates a sampler from the three classic parameters.  When built
+      // against OpenCL 2.0+ and the platform reports 2.0 or newer,
+      // clCreateSamplerWithProperties is used; otherwise the deprecated
+      // clCreateSampler is called.
+      // Throws pyopencl::error if sampler creation fails.
+      sampler(context const &ctx, bool normalized_coordinates,
+          cl_addressing_mode am, cl_filter_mode fm)
+      {
+        PYOPENCL_PRINT_CALL_TRACE("clCreateSampler");
+
+        int hex_plat_version = ctx.get_hex_platform_version();
+#if PYOPENCL_CL_VERSION >= 0x2000
+        if (hex_plat_version  >= 0x2000)
+        {
+            cl_sampler_properties props_list[] = {
+              CL_SAMPLER_NORMALIZED_COORDS, normalized_coordinates,
+              CL_SAMPLER_ADDRESSING_MODE, am,
+              CL_SAMPLER_FILTER_MODE, fm,
+              0,
+            };
+
+            cl_int status_code;
+
+            PYOPENCL_PRINT_CALL_TRACE("clCreateSamplerWithProperties");
+            m_sampler = clCreateSamplerWithProperties(
+                ctx.data(), props_list, &status_code);
+
+            if (status_code != CL_SUCCESS)
+              throw pyopencl::error("Sampler", status_code);
+        }
+        else
+#endif
+        {
+          cl_int status_code;
+
+          // clCreateSampler is deprecated in newer headers; silence the
+          // warning on GCC for this one call.
+#if defined(__GNUG__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+          m_sampler = clCreateSampler(
+              ctx.data(),
+              normalized_coordinates,
+              am, fm, &status_code);
+#if defined(__GNUG__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+          if (status_code != CL_SUCCESS)
+            throw pyopencl::error("Sampler", status_code);
+        }
+      }
+
+      // Wraps an existing handle; when `retain` is true the handle is
+      // retained first with clRetainSampler.
+      sampler(cl_sampler samp, bool retain)
+        : m_sampler(samp)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainSampler, (samp));
+      }
+
+      ~sampler()
+      {
+        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseSampler, (m_sampler));
+      }
+
+      // Raw OpenCL handle.
+      cl_sampler data() const
+      {
+        return m_sampler;
+      }
+
+      PYOPENCL_EQUALITY_TESTS(sampler);
+
+      // Queries one sampler property.  Each PYOPENCL_GET_*_INFO macro is
+      // expected to return from this function (no case falls through in
+      // practice -- TODO confirm against the macro definitions).
+      // Throws error (CL_INVALID_VALUE) for unsupported parameter names.
+      py::object get_info(cl_sampler_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_SAMPLER_REFERENCE_COUNT:
+            PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name,
+                cl_uint);
+          case CL_SAMPLER_CONTEXT:
+            PYOPENCL_GET_OPAQUE_INFO(Sampler, m_sampler, param_name,
+                cl_context, context);
+          case CL_SAMPLER_ADDRESSING_MODE:
+            PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name,
+                cl_addressing_mode);
+          case CL_SAMPLER_FILTER_MODE:
+            PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name,
+                cl_filter_mode);
+          case CL_SAMPLER_NORMALIZED_COORDS:
+            PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name,
+                cl_bool);
+
+          default:
+            throw error("Sampler.get_info", CL_INVALID_VALUE);
+        }
+      }
+  };
+
+  // }}}
+
+
+  // {{{ program
+
+  // Owns a cl_program handle, releases it on destruction, and remembers
+  // whether the program was created from source or from binaries.
+  class program : noncopyable
+  {
+    public:
+      enum program_kind_type { KND_UNKNOWN, KND_SOURCE, KND_BINARY };
+
+    private:
+      cl_program m_program;
+      program_kind_type m_program_kind;
+
+    public:
+      // Wraps an existing handle; when `retain` is true the handle is
+      // retained first with clRetainProgram.
+      program(cl_program prog, bool retain, program_kind_type progkind=KND_UNKNOWN)
+        : m_program(prog), m_program_kind(progkind)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainProgram, (prog));
+      }
+
+      ~program()
+      {
+        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseProgram, (m_program));
+      }
+
+      // Raw OpenCL handle.
+      cl_program data() const
+      {
+        return m_program;
+      }
+
+      // How this program was created (source, binary or unknown).
+      program_kind_type kind() const
+      {
+        return m_program_kind;
+      }
+
+      PYOPENCL_EQUALITY_TESTS(program);
+
+      // Queries one program property.  The PYOPENCL_GET_*_INFO and
+      // PYOPENCL_RETURN_VECTOR macros are expected to return from this
+      // function (TODO confirm against the macro definitions).
+      // Throws error (CL_INVALID_VALUE) for unsupported parameter names.
+      py::object get_info(cl_program_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_PROGRAM_REFERENCE_COUNT:
+            PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+                cl_uint);
+          case CL_PROGRAM_CONTEXT:
+            PYOPENCL_GET_OPAQUE_INFO(Program, m_program, param_name,
+                cl_context, context);
+          case CL_PROGRAM_NUM_DEVICES:
+            PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+                cl_uint);
+          case CL_PROGRAM_DEVICES:
+            {
+              std::vector<cl_device_id> result;
+              PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result);
+
+              py::list py_result;
+              for (cl_device_id did: result)
+                py_result.append(handle_from_new_ptr(
+                      new pyopencl::device(did)));
+              return py_result;
+            }
+          case CL_PROGRAM_SOURCE:
+            PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
+          case CL_PROGRAM_BINARY_SIZES:
+            {
+              std::vector<size_t> result;
+              PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result);
+              PYOPENCL_RETURN_VECTOR(size_t, result);
+            }
+          case CL_PROGRAM_BINARIES:
+            // {{{
+            {
+              std::vector<size_t> sizes;
+              PYOPENCL_GET_VEC_INFO(Program, m_program, CL_PROGRAM_BINARY_SIZES, sizes);
+
+              // The initial value fixes the accumulator type; a plain 0
+              // would sum in int and truncate large totals.
+              size_t total_size = std::accumulate(
+                  sizes.begin(), sizes.end(), size_t(0));
+
+              // One contiguous allocation holds all binaries back to back;
+              // result_ptrs[i] points at the start of binary i within it.
+              std::unique_ptr<unsigned char []> result(
+                  new unsigned char[total_size]);
+              std::vector<unsigned char *> result_ptrs;
+
+              unsigned char *ptr = result.get();
+              for (size_t i = 0; i < sizes.size(); ++i)
+              {
+                result_ptrs.push_back(ptr);
+                ptr += sizes[i];
+              }
+
+              PYOPENCL_CALL_GUARDED(clGetProgramInfo,
+                  (m_program, param_name, sizes.size()*sizeof(unsigned char *),
+                   result_ptrs.empty( ) ? nullptr : &result_ptrs.front(), 0));
+
+              // Copy each binary out into its own Python bytes/str object.
+              py::list py_result;
+              ptr = result.get();
+              for (size_t i = 0; i < sizes.size(); ++i)
+              {
+                py::object binary_pyobj(
+                    py::reinterpret_steal<py::object>(
+#if PY_VERSION_HEX >= 0x03000000
+                    PyBytes_FromStringAndSize(
+                      reinterpret_cast<char *>(ptr), sizes[i])
+#else
+                    PyString_FromStringAndSize(
+                      reinterpret_cast<char *>(ptr), sizes[i])
+#endif
+                    ));
+                py_result.append(binary_pyobj);
+                ptr += sizes[i];
+              }
+              return py_result;
+            }
+            // }}}
+#if PYOPENCL_CL_VERSION >= 0x1020
+          case CL_PROGRAM_NUM_KERNELS:
+            PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+                size_t);
+          case CL_PROGRAM_KERNEL_NAMES:
+            PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
+#endif
+
+          default:
+            throw error("Program.get_info", CL_INVALID_VALUE);
+        }
+      }
+
+      // Queries one per-device build property of this program.
+      // Throws error (CL_INVALID_VALUE) for unsupported parameter names.
+      py::object get_build_info(
+          device const &dev,
+          cl_program_build_info param_name) const
+      {
+        switch (param_name)
+        {
+          // The info macros take a single "first argument"; this define
+          // passes both the program and the device through that slot.
+#define PYOPENCL_FIRST_ARG m_program, dev.data() // hackety hack
+          case CL_PROGRAM_BUILD_STATUS:
+            PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild,
+                PYOPENCL_FIRST_ARG, param_name,
+                cl_build_status);
+          case CL_PROGRAM_BUILD_OPTIONS:
+          case CL_PROGRAM_BUILD_LOG:
+            PYOPENCL_GET_STR_INFO(ProgramBuild,
+                PYOPENCL_FIRST_ARG, param_name);
+#if PYOPENCL_CL_VERSION >= 0x1020
+          case CL_PROGRAM_BINARY_TYPE:
+            PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild,
+                PYOPENCL_FIRST_ARG, param_name,
+                cl_program_binary_type);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+          case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE:
+            PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild,
+                PYOPENCL_FIRST_ARG, param_name,
+                size_t);
+#endif
+#undef PYOPENCL_FIRST_ARG
+
+          default:
+            throw error("Program.get_build_info", CL_INVALID_VALUE);
+        }
+      }
+
+      // Builds the program with clBuildProgram.
+      //
+      // options:    build options string, passed through unchanged
+      // py_devices: devices to build for; PYOPENCL_PARSE_PY_DEVICES
+      //             provides the `num_devices`/`devices` locals
+      //             (presumably parsed from this argument)
+      void build(std::string options, py::object py_devices)
+      {
+        PYOPENCL_PARSE_PY_DEVICES;
+
+        PYOPENCL_CALL_GUARDED_THREADED(clBuildProgram,
+            (m_program, num_devices, devices,
+             options.c_str(), 0 ,0));
+      }
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+      // Compiles the program with clCompileProgram.
+      //
+      // options:    compile options string, passed through unchanged
+      // py_devices: devices to compile for (see build())
+      // py_headers: iterable of (name, program) tuples naming embedded
+      //             headers
+      //
+      // Throws error (CL_INVALID_VALUE) if an entry of py_headers is not a
+      // 2-tuple.
+      void compile(std::string options, py::object py_devices,
+          py::object py_headers)
+      {
+        PYOPENCL_PARSE_PY_DEVICES;
+
+        // {{{ pick apart py_headers
+        // py_headers is a list of tuples *(name, program)*
+
+        std::vector<std::string> header_names;
+        std::vector<cl_program> programs;
+        for (py::handle name_hdr_tup_py: py_headers)
+        {
+          py::tuple name_hdr_tup = py::reinterpret_borrow<py::tuple>(name_hdr_tup_py);
+          if (py::len(name_hdr_tup) != 2)
+            throw error("Program.compile", CL_INVALID_VALUE,
+                "expected (name, header) tuple in headers list");
+          std::string name = (name_hdr_tup[0]).cast<std::string>();
+          program &prg = (name_hdr_tup[1]).cast<program &>();
+
+          header_names.push_back(name);
+          programs.push_back(prg.data());
+        }
+
+        // Collected only after header_names is complete, so the c_str()
+        // pointers are not invalidated by vector growth.
+        std::vector<const char *> header_name_ptrs;
+        for (std::string const &name: header_names)
+          header_name_ptrs.push_back(name.c_str());
+
+        // }}}
+
+        PYOPENCL_CALL_GUARDED_THREADED(clCompileProgram,
+            (m_program, num_devices, devices,
+             options.c_str(), header_names.size(),
+             programs.empty() ? nullptr : &programs.front(),
+             header_name_ptrs.empty() ? nullptr : &header_name_ptrs.front(),
+             0, 0));
+      }
+#endif
+  };
+
+
+
+
+  // Creates a program from a single source string using
+  // clCreateProgramWithSource and wraps it as a KND_SOURCE program.
+  //
+  // Throws pyopencl::error if OpenCL reports a failure.  If allocating the
+  // wrapper fails, the freshly created cl_program is released before the
+  // exception propagates.
+  inline
+  program *create_program_with_source(
+      context &ctx,
+      std::string const &src)
+  {
+    // OpenCL takes an array of strings with explicit lengths; there is
+    // exactly one here.
+    const char *source_text = src.c_str();
+    size_t source_len = src.size();
+
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithSource");
+    cl_program prog = clCreateProgramWithSource(
+        ctx.data(), 1, &source_text, &source_len, &status_code);
+
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clCreateProgramWithSource", status_code);
+
+    try
+    {
+      return new program(prog, false, program::KND_SOURCE);
+    }
+    catch (...)
+    {
+      clReleaseProgram(prog);
+      throw;
+    }
+  }
+
+
+
+
+
+  // Creates a program from per-device binaries using
+  // clCreateProgramWithBinary and wraps it as a KND_BINARY program.
+  //
+  // ctx:         context the program is created in
+  // py_devices:  sequence of device objects
+  // py_binaries: sequence of objects exposing a readable buffer, one per
+  //              device and in the same order
+  //
+  // Throws error (CL_INVALID_VALUE) if the two sequences differ in length,
+  // py::error_already_set if the legacy buffer lookup fails, and
+  // pyopencl::error if OpenCL reports a failure.
+  inline
+  program *create_program_with_binary(
+      context &ctx,
+      py::sequence py_devices,
+      py::sequence py_binaries)
+  {
+    std::vector<cl_device_id> devices;
+    std::vector<const unsigned char *> binaries;
+    std::vector<size_t> sizes;
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+    // Kept until the end of the function so every buffer view is still
+    // held when OpenCL reads the pointers collected in `binaries`.
+    std::vector<std::unique_ptr<py_buffer_wrapper> > buf_wrappers;
+#endif
+
+    size_t num_devices = len(py_devices);
+    if (len(py_binaries) != num_devices)
+      throw error("create_program_with_binary", CL_INVALID_VALUE,
+          "device and binary counts don't match");
+
+    for (size_t i = 0; i < num_devices; ++i)
+    {
+      devices.push_back(
+          (py_devices[i]).cast<device const &>().data());
+      const void *buf;
+      PYOPENCL_BUFFER_SIZE_T buf_len;
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+      buf_wrappers.push_back(
+          std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper));
+      py_buffer_wrapper &buf_wrapper = *buf_wrappers.back();
+
+      buf_wrapper.get(py::object(py_binaries[i]).ptr(), PyBUF_ANY_CONTIGUOUS);
+
+      buf = buf_wrapper.m_buf.buf;
+      buf_len = buf_wrapper.m_buf.len;
+#else
+      if (PyObject_AsReadBuffer(
+            py::object(py_binaries[i]).ptr(), &buf, &buf_len))
+        throw py::error_already_set();
+#endif
+
+      binaries.push_back(reinterpret_cast<const unsigned char *>(buf));
+      sizes.push_back(buf_len);
+    }
+
+    // Per-device load status reported by OpenCL.  A vector replaces the
+    // former variable-length array, which is not standard C++ and would
+    // have zero length for an empty device list.
+    std::vector<cl_int> binary_statuses(num_devices);
+
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBinary");
+    cl_program result = clCreateProgramWithBinary(
+        ctx.data(), num_devices,
+        devices.empty( ) ? nullptr : &devices.front(),
+        sizes.empty( ) ? nullptr : &sizes.front(),
+        binaries.empty( ) ? nullptr : &binaries.front(),
+        binary_statuses.empty( ) ? nullptr : &binary_statuses.front(),
+        &status_code);
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clCreateProgramWithBinary", status_code);
+
+    /*
+    for (int i = 0; i < num_devices; ++i)
+      printf("%d:%d\n", i, binary_statuses[i]);
+      */
+
+    try
+    {
+      return new program(result, false, program::KND_BINARY);
+    }
+    catch (...)
+    {
+      // Do not leak the cl_program if allocating the wrapper fails.
+      clReleaseProgram(result);
+      throw;
+    }
+  }
+
+
+
+#if (PYOPENCL_CL_VERSION >= 0x1020) && \
+      ((PYOPENCL_CL_VERSION >= 0x1030) && defined(__APPLE__))
+  // Creates a program exposing device built-in kernels using
+  // clCreateProgramWithBuiltInKernels.
+  //
+  // ctx:          context the program is created in
+  // py_devices:   devices to query; PYOPENCL_PARSE_PY_DEVICES provides the
+  //               `num_devices`/`devices` locals
+  // kernel_names: kernel name list, passed through unchanged
+  //
+  // Throws pyopencl::error if OpenCL reports a failure.
+  inline
+  program *create_program_with_built_in_kernels(
+      context &ctx,
+      py::object py_devices,
+      std::string const &kernel_names)
+  {
+    PYOPENCL_PARSE_PY_DEVICES;
+
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBuiltInKernels");
+    cl_program prog = clCreateProgramWithBuiltInKernels(
+        ctx.data(),
+        num_devices, devices,
+        kernel_names.c_str(),
+        &status_code);
+
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clCreateProgramWithBuiltInKernels", status_code);
+
+    try
+    {
+      return new program(prog, false);
+    }
+    catch (...)
+    {
+      // Do not leak the cl_program if allocating the wrapper fails.
+      clReleaseProgram(prog);
+      throw;
+    }
+  }
+#endif
+
+
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  // Link a collection of compiled programs into a new program object via
+  // clLinkProgram.
+  //
+  // ctx: context in which to link.
+  // py_programs: Python iterable whose items must cast to 'program'.
+  // options: linker options, handed to OpenCL as a C string.
+  // py_devices: device selection consumed by PYOPENCL_PARSE_PY_DEVICES
+  //   (presumably that macro declares 'num_devices' and 'devices'; it is
+  //   defined outside this section).
+  //
+  // Returns a heap-allocated program wrapping the linked cl_program.
+  // Throws pyopencl::error if clLinkProgram does not report CL_SUCCESS.
+  inline
+  program *link_program(
+      context &ctx,
+      py::object py_programs,
+      std::string const &options,
+      py::object py_devices
+      )
+  {
+    PYOPENCL_PARSE_PY_DEVICES;
+
+    std::vector<cl_program> programs;
+    for (py::handle py_prg: py_programs)
+    {
+      program &prg = (py_prg).cast<program &>();
+      programs.push_back(prg.data());
+    }
+
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clLinkProgram");
+    cl_program result = clLinkProgram(
+        ctx.data(), num_devices, devices,
+        options.c_str(),
+        programs.size(),
+        programs.empty() ? nullptr : &programs.front(),
+        // no completion callback and no user data: the call is synchronous
+        0, 0,
+        &status_code);
+
+    // The routine name reported to the user must match the CL entry point
+    // (it used to be misspelled "clLinkPorgram").
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clLinkProgram", status_code);
+
+    try
+    {
+      return new program(result, false);
+    }
+    catch (...)
+    {
+      // wrapper construction failed: drop the handle we just created
+      clReleaseProgram(result);
+      throw;
+    }
+  }
+
+#endif
+
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  // Call clUnloadPlatformCompiler for the given platform through
+  // PYOPENCL_CALL_GUARDED (the guarded-call macro is defined elsewhere;
+  // presumably it throws pyopencl::error on a non-success status).
+  inline
+  void unload_platform_compiler(platform &plat)
+  {
+    PYOPENCL_CALL_GUARDED(clUnloadPlatformCompiler, (plat.data()));
+  }
+#endif
+
+  // }}}
+
+
+  // {{{ kernel
+  // Value type that carries nothing but a byte count. kernel::set_arg_local
+  // passes that count to clSetKernelArg together with a null pointer.
+  class local_memory
+  {
+    public:
+      // Record the requested size; nothing is allocated here.
+      local_memory(size_t size)
+        : m_size(size)
+      {
+      }
+
+      // The size given at construction.
+      size_t size() const
+      {
+        return m_size;
+      }
+
+    private:
+      size_t m_size;
+  };
+
+
+
+
+  // Owning wrapper around a cl_kernel handle: the destructor releases the
+  // handle, and the class offers argument setters plus the various
+  // clGetKernel*Info queries.
+  class kernel : noncopyable
+  {
+    private:
+      cl_kernel m_kernel;
+
+    public:
+      // Wrap an existing handle; when 'retain' is set, clRetainKernel is
+      // called so that the destructor's release is balanced.
+      kernel(cl_kernel knl, bool retain)
+        : m_kernel(knl)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainKernel, (knl));
+      }
+
+      // Create the kernel named 'kernel_name' from 'prg' via clCreateKernel.
+      // Throws pyopencl::error if the call does not report CL_SUCCESS.
+      kernel(program const &prg, std::string const &kernel_name)
+      {
+        cl_int status_code;
+
+        PYOPENCL_PRINT_CALL_TRACE("clCreateKernel");
+        m_kernel = clCreateKernel(prg.data(), kernel_name.c_str(),
+            &status_code);
+        if (status_code != CL_SUCCESS)
+          throw pyopencl::error("clCreateKernel", status_code);
+      }
+
+      // Release the handle using the *_CLEANUP flavor of the guarded call
+      // (defined elsewhere; presumably it does not throw -- confirm).
+      ~kernel()
+      {
+        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseKernel, (m_kernel));
+      }
+
+      // The raw handle; ownership stays with this object.
+      cl_kernel data() const
+      {
+        return m_kernel;
+      }
+
+      PYOPENCL_EQUALITY_TESTS(kernel);
+
+      // Set argument 'arg_index' to a null cl_mem.
+      void set_arg_null(cl_uint arg_index)
+      {
+        cl_mem m = 0;
+        PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index,
+              sizeof(cl_mem), &m));
+      }
+
+      // Set argument 'arg_index' to the cl_mem held by 'moh'.
+      void set_arg_mem(cl_uint arg_index, memory_object_holder &moh)
+      {
+        cl_mem m = moh.data();
+        PYOPENCL_CALL_GUARDED(clSetKernelArg,
+            (m_kernel, arg_index, sizeof(cl_mem), &m));
+      }
+
+      // Set argument 'arg_index' to 'loc.size()' bytes with a null value
+      // pointer.
+      void set_arg_local(cl_uint arg_index, local_memory const &loc)
+      {
+        PYOPENCL_CALL_GUARDED(clSetKernelArg,
+            (m_kernel, arg_index, loc.size(), 0));
+      }
+
+      // Set argument 'arg_index' to the cl_sampler held by 'smp'.
+      void set_arg_sampler(cl_uint arg_index, sampler const &smp)
+      {
+        cl_sampler s = smp.data();
+        PYOPENCL_CALL_GUARDED(clSetKernelArg,
+            (m_kernel, arg_index, sizeof(cl_sampler), &s));
+      }
+
+      // Set argument 'arg_index' to the raw bytes exposed by 'py_buffer'
+      // through the Python buffer interface. If the buffer cannot be
+      // obtained, the Python error is cleared and a pyopencl error
+      // (CL_INVALID_VALUE, "invalid kernel argument") is thrown instead.
+      void set_arg_buf(cl_uint arg_index, py::object py_buffer)
+      {
+        const void *buf;
+        PYOPENCL_BUFFER_SIZE_T len;
+
+#ifdef PYOPENCL_USE_NEW_BUFFER_INTERFACE
+        py_buffer_wrapper buf_wrapper;
+
+        try
+        {
+          buf_wrapper.get(py_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+        }
+        catch (py::error_already_set &)
+        {
+          PyErr_Clear();
+          throw error("Kernel.set_arg", CL_INVALID_VALUE,
+              "invalid kernel argument");
+        }
+
+        buf = buf_wrapper.m_buf.buf;
+        len = buf_wrapper.m_buf.len;
+#else
+        if (PyObject_AsReadBuffer(py_buffer.ptr(), &buf, &len))
+        {
+          PyErr_Clear();
+          throw error("Kernel.set_arg", CL_INVALID_VALUE,
+              "invalid kernel argument");
+        }
+#endif
+
+        PYOPENCL_CALL_GUARDED(clSetKernelArg,
+            (m_kernel, arg_index, len, buf));
+      }
+
+#if PYOPENCL_CL_VERSION >= 0x2000
+      // Set argument 'arg_index' to the SVM pointer carried by 'wrp'.
+      void set_arg_svm(cl_uint arg_index, svm_arg_wrapper const &wrp)
+      {
+        PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer,
+            (m_kernel, arg_index, wrp.ptr()));
+      }
+#endif
+
+      // Generic setter: dispatch on the Python type of 'arg'. None becomes
+      // a null cl_mem; otherwise each wrapper type is tried in turn
+      // (memory object, SVM wrapper on CL >= 2.0, local memory, sampler),
+      // moving on whenever the cast fails. Anything left over is handed to
+      // set_arg_buf.
+      void set_arg(cl_uint arg_index, py::object arg)
+      {
+        if (arg.ptr() == Py_None)
+        {
+          set_arg_null(arg_index);
+          return;
+        }
+
+        try
+        {
+          set_arg_mem(arg_index, arg.cast<memory_object_holder &>());
+          return;
+        }
+        catch (py::cast_error &) { }
+
+#if PYOPENCL_CL_VERSION >= 0x2000
+        try
+        {
+          set_arg_svm(arg_index, arg.cast<svm_arg_wrapper const &>());
+          return;
+        }
+        catch (py::cast_error &) { }
+#endif
+
+        try
+        {
+          set_arg_local(arg_index, arg.cast<local_memory>());
+          return;
+        }
+        catch (py::cast_error &) { }
+
+        try
+        {
+          set_arg_sampler(arg_index, arg.cast<const sampler &>());
+          return;
+        }
+        catch (py::cast_error &) { }
+
+        set_arg_buf(arg_index, arg);
+      }
+
+      // Query clGetKernelInfo for 'param_name'. Unsupported parameters
+      // raise a pyopencl error with CL_INVALID_VALUE.
+      // NOTE(review): the cases carry no 'break', so the PYOPENCL_GET_*_INFO
+      // macros (defined elsewhere) are expected to return from this
+      // function -- confirm against their definitions.
+      py::object get_info(cl_kernel_info param_name) const
+      {
+        switch (param_name)
+        {
+          case CL_KERNEL_FUNCTION_NAME:
+            PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
+          case CL_KERNEL_NUM_ARGS:
+          case CL_KERNEL_REFERENCE_COUNT:
+            PYOPENCL_GET_INTEGRAL_INFO(Kernel, m_kernel, param_name,
+                cl_uint);
+          case CL_KERNEL_CONTEXT:
+            PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
+                cl_context, context);
+          case CL_KERNEL_PROGRAM:
+            PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
+                cl_program, program);
+#if PYOPENCL_CL_VERSION >= 0x1020
+          case CL_KERNEL_ATTRIBUTES:
+            PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
+#endif
+          default:
+            throw error("Kernel.get_info", CL_INVALID_VALUE);
+        }
+      }
+
+      // Query per-device work-group information for this kernel on 'dev'.
+      // Unsupported parameters raise a pyopencl error with CL_INVALID_VALUE.
+      py::object get_work_group_info(
+          cl_kernel_work_group_info param_name,
+          device const &dev
+          ) const
+      {
+        switch (param_name)
+        {
+          // The info macros take a single "first argument"; this define
+          // smuggles the (kernel, device) pair through that slot.
+#define PYOPENCL_FIRST_ARG m_kernel, dev.data() // hackety hack
+          case CL_KERNEL_WORK_GROUP_SIZE:
+            PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+                PYOPENCL_FIRST_ARG, param_name,
+                size_t);
+          case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
+            {
+              std::vector<size_t> result;
+              PYOPENCL_GET_VEC_INFO(KernelWorkGroup,
+                  PYOPENCL_FIRST_ARG, param_name, result);
+
+              PYOPENCL_RETURN_VECTOR(size_t, result);
+            }
+          case CL_KERNEL_LOCAL_MEM_SIZE:
+#if PYOPENCL_CL_VERSION >= 0x1010
+          case CL_KERNEL_PRIVATE_MEM_SIZE:
+#endif
+            PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+                PYOPENCL_FIRST_ARG, param_name,
+                cl_ulong);
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+          case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+            PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+                PYOPENCL_FIRST_ARG, param_name,
+                size_t);
+#endif
+          default:
+            throw error("Kernel.get_work_group_info", CL_INVALID_VALUE);
+#undef PYOPENCL_FIRST_ARG
+        }
+      }
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+      // Query information about the kernel argument at 'arg_index'.
+      // Unsupported parameters raise a pyopencl error with CL_INVALID_VALUE.
+      py::object get_arg_info(
+          cl_uint arg_index,
+          cl_kernel_arg_info param_name
+          ) const
+      {
+        switch (param_name)
+        {
+          // Same trick as in get_work_group_info: pass the (kernel, index)
+          // pair through the macros' single "first argument" slot.
+#define PYOPENCL_FIRST_ARG m_kernel, arg_index // hackety hack
+          case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
+            PYOPENCL_GET_INTEGRAL_INFO(KernelArg,
+                PYOPENCL_FIRST_ARG, param_name,
+                cl_kernel_arg_address_qualifier);
+
+          case CL_KERNEL_ARG_ACCESS_QUALIFIER:
+            PYOPENCL_GET_INTEGRAL_INFO(KernelArg,
+                PYOPENCL_FIRST_ARG, param_name,
+                cl_kernel_arg_access_qualifier);
+
+          case CL_KERNEL_ARG_TYPE_NAME:
+          case CL_KERNEL_ARG_NAME:
+            PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name);
+#undef PYOPENCL_FIRST_ARG
+          default:
+            throw error("Kernel.get_arg_info", CL_INVALID_VALUE);
+        }
+      }
+#endif
+  };
+
+
+  // Build a Python list holding one 'kernel' wrapper for every kernel that
+  // clCreateKernelsInProgram reports for 'pgm'. Each wrapper is constructed
+  // with retain=true.
+  inline
+  py::list create_kernels_in_program(program &pgm)
+  {
+    // first call: only ask how many kernels there are
+    cl_uint kernel_count;
+    PYOPENCL_CALL_GUARDED(clCreateKernelsInProgram, (
+          pgm.data(), 0, 0, &kernel_count));
+
+    // second call: let OpenCL fill in that many handles
+    std::vector<cl_kernel> handles(kernel_count);
+    cl_kernel *handles_ptr = handles.empty( ) ? nullptr : &handles.front();
+    PYOPENCL_CALL_GUARDED(clCreateKernelsInProgram, (
+          pgm.data(), kernel_count, handles_ptr, &kernel_count));
+
+    py::list wrapped;
+    for (size_t i = 0; i < handles.size(); ++i)
+      wrapped.append(handle_from_new_ptr(new kernel(handles[i], true)));
+
+    return wrapped;
+  }
+
+
+
+  // Enqueue 'knl' on 'cq' via clEnqueueNDRangeKernel.
+  //
+  // py_global_work_size: Python sequence; its length fixes the initial
+  //   work dimension.
+  // py_local_work_size: None, or a Python sequence of local sizes.
+  // py_global_work_offset: None, or a sequence with exactly work_dim
+  //   entries.
+  // py_wait_for: consumed by PYOPENCL_PARSE_WAIT_FOR (defined elsewhere;
+  //   presumably the events to wait on).
+  // g_times_l: when true and a local size is given, the global size (and
+  //   the offset, if any) is multiplied axis-by-axis by the local size, and
+  //   the shorter of the two size lists is padded with 1s.
+  //
+  // Throws a pyopencl error (CL_INVALID_VALUE) on mismatched dimensions.
+  // The COPY_PY_LIST / PYOPENCL_* macros are defined outside this section;
+  // COPY_PY_LIST(size_t, x) presumably fills 'x' from 'py_x'.
+  inline
+  event *enqueue_nd_range_kernel(
+      command_queue &cq,
+      kernel &knl,
+      py::object py_global_work_size,
+      py::object py_local_work_size,
+      py::object py_global_work_offset,
+      py::object py_wait_for,
+      bool g_times_l)
+  {
+    PYOPENCL_PARSE_WAIT_FOR;
+
+    cl_uint work_dim = len(py_global_work_size);
+
+    std::vector<size_t> global_work_size;
+    COPY_PY_LIST(size_t, global_work_size);
+
+    // stays null when no local size was supplied
+    size_t *local_work_size_ptr = 0;
+    std::vector<size_t> local_work_size;
+    if (py_local_work_size.ptr() != Py_None)
+    {
+      // with g_times_l the two lists may differ in length (the longer one
+      // decides the dimension); otherwise they must agree exactly
+      if (g_times_l)
+        work_dim = std::max(work_dim, unsigned(len(py_local_work_size)));
+      else
+        if (work_dim != unsigned(len(py_local_work_size)))
+          throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
+              "global/local work sizes have differing dimensions");
+
+      COPY_PY_LIST(size_t, local_work_size);
+
+      // pad whichever list is shorter with 1s up to work_dim
+      while (local_work_size.size() < work_dim)
+        local_work_size.push_back(1);
+      while (global_work_size.size() < work_dim)
+        global_work_size.push_back(1);
+
+      local_work_size_ptr = local_work_size.empty( ) ? nullptr : &local_work_size.front();
+    }
+
+    // turn a "number of groups" into a global size
+    if (g_times_l && local_work_size_ptr)
+    {
+      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+        global_work_size[work_axis] *= local_work_size[work_axis];
+    }
+
+    // stays null when no offset was supplied
+    size_t *global_work_offset_ptr = 0;
+    std::vector<size_t> global_work_offset;
+    if (py_global_work_offset.ptr() != Py_None)
+    {
+      if (work_dim != unsigned(len(py_global_work_offset)))
+        throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
+            "global work size and offset have differing dimensions");
+
+      COPY_PY_LIST(size_t, global_work_offset);
+
+      // offsets are scaled by the local size just like the global size
+      if (g_times_l && local_work_size_ptr)
+      {
+        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+          global_work_offset[work_axis] *= local_work_size[work_axis];
+      }
+
+      global_work_offset_ptr = global_work_offset.empty( ) ? nullptr :  &global_work_offset.front();
+    }
+
+    PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( {
+          cl_event evt;
+          PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, (
+                cq.data(),
+                knl.data(),
+                work_dim,
+                global_work_offset_ptr,
+                global_work_size.empty( ) ? nullptr : &global_work_size.front(),
+                local_work_size_ptr,
+                PYOPENCL_WAITLIST_ARGS, &evt
+                ));
+          PYOPENCL_RETURN_NEW_EVENT(evt);
+        } );
+  }
+
+  // }}}
+
+
+  // {{{ gl interop
+  // Report whether this build was compiled with HAVE_GL defined.
+  inline
+  bool have_gl()
+  {
+#if defined(HAVE_GL)
+    const bool gl_enabled = true;
+#else
+    const bool gl_enabled = false;
+#endif
+    return gl_enabled;
+  }
+
+
+
+
+#ifdef HAVE_GL
+
+#ifdef __APPLE__
+  // Look up the share group of the current CGL context and return it
+  // converted to a cl_context_properties value.
+  inline
+  cl_context_properties get_apple_cgl_share_group()
+  {
+    CGLShareGroupObj share_group = CGLGetShareGroup(CGLGetCurrentContext());
+    return (cl_context_properties) share_group;
+  }
+#endif /* __APPLE__ */
+
+
+
+
+  // memory_object subclass used for cl_mem handles created from GL buffer
+  // objects (see create_from_gl_buffer). Adds no state or behavior; the
+  // constructor forwards everything to memory_object.
+  class gl_buffer : public memory_object
+  {
+    public:
+      gl_buffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
+        : memory_object(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf))
+      { }
+  };
+
+
+
+
+  // memory_object subclass used for cl_mem handles created from GL
+  // renderbuffers (see create_from_gl_renderbuffer). Adds no state or
+  // behavior; the constructor forwards everything to memory_object.
+  class gl_renderbuffer : public memory_object
+  {
+    public:
+      gl_renderbuffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
+        : memory_object(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf))
+      { }
+  };
+
+
+
+
+  // image subclass used for cl_mem handles created from GL textures; adds a
+  // query for GL texture information on top of what 'image' provides.
+  class gl_texture : public image
+  {
+    public:
+      gl_texture(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
+        : image(mem, retain, PYOPENCL_STD_MOVE_IF_NEW_BUF_INTF(hostbuf))
+      { }
+
+      // Query GL texture info for this object. Only CL_GL_TEXTURE_TARGET
+      // and CL_GL_MIPMAP_LEVEL are handled; anything else raises a pyopencl
+      // error with CL_INVALID_VALUE.
+      // NOTE(review): the cases carry no 'break', so
+      // PYOPENCL_GET_INTEGRAL_INFO (defined elsewhere) is expected to
+      // return from this function -- confirm.
+      py::object get_gl_texture_info(cl_gl_texture_info param_name)
+      {
+        switch (param_name)
+        {
+          case CL_GL_TEXTURE_TARGET:
+            PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLenum);
+          case CL_GL_MIPMAP_LEVEL:
+            PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLint);
+
+          default:
+            throw error("MemoryObject.get_gl_texture_info", CL_INVALID_VALUE);
+        }
+      }
+  };
+
+
+
+
+// Define an inline factory function 'TYPE *NAME ARGS' that calls the OpenCL
+// constructor CL_NAME with CL_ARGS and wraps the resulting cl_mem in a
+// heap-allocated TYPE without retaining it. CL_ARGS must pass
+// '&status_code' (the local declared in the generated body) so the result
+// of the call can be checked; a non-success status raises pyopencl::error
+// named after CL_NAME.
+//
+// If constructing TYPE throws, the cl_mem is released with the *_CLEANUP
+// flavor of the guarded call (the one ~kernel uses), so that a failing
+// release cannot replace the exception that is already propagating.
+// Comments inside the macro body must use /* */ because of the line
+// continuations.
+#define PYOPENCL_WRAP_BUFFER_CREATOR(TYPE, NAME, CL_NAME, ARGS, CL_ARGS) \
+  inline \
+  TYPE *NAME ARGS \
+  { \
+    cl_int status_code; \
+    PYOPENCL_PRINT_CALL_TRACE(#CL_NAME); \
+    cl_mem mem = CL_NAME CL_ARGS; \
+    \
+    if (status_code != CL_SUCCESS) \
+      throw pyopencl::error(#CL_NAME, status_code); \
+    \
+    try \
+    { \
+      return new TYPE(mem, false); \
+    } \
+    catch (...) \
+    { \
+      /* must not throw here: keep the original exception in flight */ \
+      PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseMemObject, (mem)); \
+      throw; \
+    } \
+  }
+
+
+
+
+  // Instantiate the factory macro once per GL interop constructor. Each
+  // expansion defines a function taking the first parenthesized list as its
+  // parameters and forwarding the second one to the named OpenCL call.
+
+  // create_from_gl_buffer -> clCreateFromGLBuffer, yields gl_buffer
+  PYOPENCL_WRAP_BUFFER_CREATOR(gl_buffer,
+      create_from_gl_buffer, clCreateFromGLBuffer,
+      (context &ctx, cl_mem_flags flags, GLuint bufobj),
+      (ctx.data(), flags, bufobj, &status_code));
+  // create_from_gl_texture_2d -> clCreateFromGLTexture2D, yields gl_texture
+  PYOPENCL_WRAP_BUFFER_CREATOR(gl_texture,
+      create_from_gl_texture_2d, clCreateFromGLTexture2D,
+      (context &ctx, cl_mem_flags flags,
+         GLenum texture_target, GLint miplevel, GLuint texture),
+      (ctx.data(), flags, texture_target, miplevel, texture, &status_code));
+  // create_from_gl_texture_3d -> clCreateFromGLTexture3D, yields gl_texture
+  PYOPENCL_WRAP_BUFFER_CREATOR(gl_texture,
+      create_from_gl_texture_3d, clCreateFromGLTexture3D,
+      (context &ctx, cl_mem_flags flags,
+         GLenum texture_target, GLint miplevel, GLuint texture),
+      (ctx.data(), flags, texture_target, miplevel, texture, &status_code));
+  // create_from_gl_renderbuffer -> clCreateFromGLRenderbuffer,
+  // yields gl_renderbuffer
+  PYOPENCL_WRAP_BUFFER_CREATOR(gl_renderbuffer,
+      create_from_gl_renderbuffer, clCreateFromGLRenderbuffer,
+      (context &ctx, cl_mem_flags flags, GLuint renderbuffer),
+      (ctx.data(), flags, renderbuffer, &status_code));
+
+  // Dimension-dispatching front end for the GL texture factories: dims == 2
+  // goes to create_from_gl_texture_2d, dims == 3 to
+  // create_from_gl_texture_3d, and any other value raises a pyopencl error
+  // with CL_INVALID_VALUE.
+  inline
+  gl_texture *create_from_gl_texture(
+      context &ctx, cl_mem_flags flags,
+      GLenum texture_target, GLint miplevel,
+      GLuint texture, unsigned dims)
+  {
+    switch (dims)
+    {
+      case 2:
+        return create_from_gl_texture_2d(
+            ctx, flags, texture_target, miplevel, texture);
+
+      case 3:
+        return create_from_gl_texture_3d(
+            ctx, flags, texture_target, miplevel, texture);
+
+      default:
+        throw pyopencl::error("Image", CL_INVALID_VALUE,
+            "invalid dimension");
+    }
+  }
+
+
+
+
+
+  // Ask clGetGLObjectInfo about 'mem' and hand the answer back to Python as
+  // a 2-tuple (GL object type, GL object name).
+  inline
+  py::tuple get_gl_object_info(memory_object_holder const &mem)
+  {
+    GLuint object_name;
+    cl_gl_object_type object_type;
+    PYOPENCL_CALL_GUARDED(clGetGLObjectInfo,
+        (mem.data(), &object_type, &object_name));
+
+    py::tuple info = py::make_tuple(object_type, object_name);
+    return info;
+  }
+
+// Generate enqueue_<what>_gl_objects(cq, py_mem_objects, py_wait_for),
+// which collects the cl_mem handle of every item in py_mem_objects (each
+// must cast to memory_object_holder) and passes them to
+// clEnqueue<What>GLObjects on 'cq'. The wait list handling and the returned
+// event come from PYOPENCL_PARSE_WAIT_FOR / PYOPENCL_WAITLIST_ARGS /
+// PYOPENCL_RETURN_NEW_EVENT, which are defined outside this section.
+#define WRAP_GL_ENQUEUE(what, What) \
+  inline \
+  event *enqueue_##what##_gl_objects( \
+      command_queue &cq, \
+      py::object py_mem_objects, \
+      py::object py_wait_for) \
+  { \
+    PYOPENCL_PARSE_WAIT_FOR; \
+    \
+    std::vector<cl_mem> mem_objects; \
+    for (py::handle mo: py_mem_objects) \
+      mem_objects.push_back((mo).cast<memory_object_holder &>().data()); \
+    \
+    cl_event evt; \
+    PYOPENCL_CALL_GUARDED(clEnqueue##What##GLObjects, ( \
+          cq.data(), \
+          mem_objects.size(), mem_objects.empty( ) ? nullptr : &mem_objects.front(), \
+          PYOPENCL_WAITLIST_ARGS, &evt \
+          )); \
+    \
+    PYOPENCL_RETURN_NEW_EVENT(evt); \
+  }
+
+  // enqueue_acquire_gl_objects and enqueue_release_gl_objects
+  WRAP_GL_ENQUEUE(acquire, Acquire);
+  WRAP_GL_ENQUEUE(release, Release);
+#endif
+
+
+
+
+#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
+  // Query the clGetGLContextInfoKHR extension function.
+  //
+  // py_properties: context properties, parsed by parse_context_properties.
+  // param_name: CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR (returns one device)
+  //   or CL_DEVICES_FOR_GL_CONTEXT_KHR (returns a list of devices).
+  // py_platform: on CL >= 1.2, the platform whose extension table is
+  //   searched; None falls back to the platform-less lookup and triggers
+  //   PYOPENCL_DEPRECATED. Ignored on older CL versions.
+  //
+  // Throws a pyopencl error with CL_INVALID_PLATFORM if the extension
+  // function cannot be found and CL_INVALID_VALUE for any other param_name.
+  inline
+  py::object get_gl_context_info_khr(
+      py::object py_properties,
+      cl_gl_context_info param_name,
+      py::object py_platform
+      )
+  {
+    std::vector<cl_context_properties> props
+      = parse_context_properties(py_properties);
+
+    // signature of clGetGLContextInfoKHR, needed to call through the
+    // pointer returned by the extension lookup
+    typedef CL_API_ENTRY cl_int (CL_API_CALL
+      *func_ptr_type)(const cl_context_properties * /* properties */,
+          cl_gl_context_info            /* param_name */,
+          size_t                        /* param_value_size */,
+          void *                        /* param_value */,
+          size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+    func_ptr_type func_ptr;
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+    if (py_platform.ptr() != Py_None)
+    {
+      platform &plat = (py_platform).cast<platform &>();
+
+      func_ptr = (func_ptr_type) clGetExtensionFunctionAddressForPlatform(
+            plat.data(), "clGetGLContextInfoKHR");
+    }
+    else
+    {
+      PYOPENCL_DEPRECATED("get_gl_context_info_khr with platform=None", "2013.1", );
+
+      func_ptr = (func_ptr_type) clGetExtensionFunctionAddress(
+            "clGetGLContextInfoKHR");
+    }
+#else
+    func_ptr = (func_ptr_type) clGetExtensionFunctionAddress(
+          "clGetGLContextInfoKHR");
+#endif
+
+
+    if (!func_ptr)
+      throw error("Context.get_info", CL_INVALID_PLATFORM,
+          "clGetGLContextInfoKHR extension function not present");
+
+    cl_context_properties *props_ptr
+      = props.empty( ) ? nullptr : &props.front();
+
+    switch (param_name)
+    {
+      case CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR:
+        {
+          cl_device_id param_value;
+          PYOPENCL_CALL_GUARDED(func_ptr,
+              (props_ptr, param_name, sizeof(param_value), &param_value, 0));
+          return py::object(handle_from_new_ptr( \
+                new device(param_value, /*retain*/ true)));
+        }
+
+      case CL_DEVICES_FOR_GL_CONTEXT_KHR:
+        {
+          // first call: size of the answer in bytes
+          size_t size;
+          PYOPENCL_CALL_GUARDED(func_ptr,
+              (props_ptr, param_name, 0, 0, &size));
+
+          std::vector<cl_device_id> devices;
+
+          // sizeof does not evaluate front(), so this is fine on an empty
+          // vector
+          devices.resize(size / sizeof(devices.front()));
+
+          // second call: fetch the device ids themselves
+          PYOPENCL_CALL_GUARDED(func_ptr,
+              (props_ptr, param_name, size,
+               devices.empty( ) ? nullptr : &devices.front(), &size));
+
+          py::list result;
+          for (cl_device_id did: devices)
+            result.append(handle_from_new_ptr(
+                  new device(did)));
+
+          return result;
+        }
+
+      default:
+        throw error("get_gl_context_info_khr", CL_INVALID_VALUE);
+    }
+  }
+
+#endif
+
+  // }}}
+
+
+  // {{{ deferred implementation bits
+
+  // Wrap a raw cl_mem in the most specific Python-visible class: 'buffer'
+  // for CL_MEM_OBJECT_BUFFER, 'image' for any of the image object types
+  // known at compile time, and plain 'memory_object' for everything else.
+  // 'retain' is forwarded to the wrapper's constructor.
+  inline py::object create_mem_object_wrapper(cl_mem mem, bool retain=true)
+  {
+    cl_mem_object_type kind;
+    PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+        (mem, CL_MEM_TYPE, sizeof(kind), &kind, 0));
+
+    if (kind == CL_MEM_OBJECT_BUFFER)
+      return py::object(handle_from_new_ptr(
+            new buffer(mem, retain)));
+
+    bool is_image =
+      kind == CL_MEM_OBJECT_IMAGE2D
+      || kind == CL_MEM_OBJECT_IMAGE3D;
+#if PYOPENCL_CL_VERSION >= 0x1020
+    is_image = is_image
+      || kind == CL_MEM_OBJECT_IMAGE2D_ARRAY
+      || kind == CL_MEM_OBJECT_IMAGE1D
+      || kind == CL_MEM_OBJECT_IMAGE1D_ARRAY
+      || kind == CL_MEM_OBJECT_IMAGE1D_BUFFER;
+#endif
+
+    if (is_image)
+      return py::object(handle_from_new_ptr(
+            new image(mem, retain)));
+
+    return py::object(handle_from_new_ptr(
+          new memory_object(mem, retain)));
+  }
+
+  // Reinterpret an integer as a cl_mem handle and wrap it through
+  // create_mem_object_wrapper, forwarding 'retain'.
+  inline
+  py::object memory_object_from_int(intptr_t cl_mem_as_int, bool retain)
+  {
+    cl_mem mem = reinterpret_cast<cl_mem>(cl_mem_as_int);
+    return create_mem_object_wrapper(mem, retain);
+  }
+
+
+  // Query clGetMemObjectInfo for 'param_name' on the held cl_mem.
+  //
+  // CL_MEM_HOST_PTR is deliberately refused (the error message points to
+  // get_host_array instead); unsupported parameters raise a pyopencl error
+  // with CL_INVALID_VALUE.
+  // NOTE(review): the cases carry no 'break', so the PYOPENCL_GET_*_INFO
+  // macros (defined elsewhere) are expected to return from this function --
+  // confirm against their definitions.
+  inline
+  py::object memory_object_holder::get_info(cl_mem_info param_name) const
+  {
+    switch (param_name)
+    {
+      case CL_MEM_TYPE:
+        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+            cl_mem_object_type);
+      case CL_MEM_FLAGS:
+        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+            cl_mem_flags);
+      case CL_MEM_SIZE:
+        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+            size_t);
+      case CL_MEM_HOST_PTR:
+        throw pyopencl::error("MemoryObject.get_info", CL_INVALID_VALUE,
+            "Use MemoryObject.get_host_array to get host pointer.");
+      case CL_MEM_MAP_COUNT:
+        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+            cl_uint);
+      case CL_MEM_REFERENCE_COUNT:
+        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+            cl_uint);
+      case CL_MEM_CONTEXT:
+        PYOPENCL_GET_OPAQUE_INFO(MemObject, data(), param_name,
+            cl_context, context);
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+      case CL_MEM_ASSOCIATED_MEMOBJECT:
+        {
+          cl_mem param_value;
+          PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, \
+              (data(), param_name, sizeof(param_value), &param_value, 0));
+          if (param_value == 0)
+          {
+            // no associated memory object? no problem.
+            return py::none();
+          }
+
+          // wrapped with the default retain=true
+          return create_mem_object_wrapper(param_value);
+        }
+      case CL_MEM_OFFSET:
+        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+            size_t);
+#endif
+
+      default:
+        throw error("MemoryObjectHolder.get_info", CL_INVALID_VALUE);
+    }
+  }
+
+  // FIXME: Reenable in pypy
+#ifndef PYPY_VERSION
+  // Expose the host memory behind a CL_MEM_USE_HOST_PTR memory object as a
+  // numpy array of the requested shape/dtype/order, without copying.
+  //
+  // mem_obj_py: Python object that casts to memory_object_holder; it is
+  //   installed as the base object of the returned array.
+  // shape: a single integer, or an iterable of integers.
+  // dtype: anything PyArray_DescrConverter accepts.
+  // order_py: anything PyArray_OrderConverter accepts; only C and Fortran
+  //   order are supported.
+  //
+  // Throws py::error_already_set when a numpy conversion or the array
+  // construction fails, pyopencl::error (CL_INVALID_VALUE) when the memory
+  // object lacks USE_HOST_PTR or is smaller than the requested array, and
+  // std::runtime_error for any other order.
+  inline
+  py::object get_mem_obj_host_array(
+      py::object mem_obj_py,
+      py::object shape, py::object dtype,
+      py::object order_py)
+  {
+    memory_object_holder const &mem_obj =
+      (mem_obj_py).cast<memory_object_holder const &>();
+
+    PyArray_Descr *tp_descr;
+    if (PyArray_DescrConverter(dtype.ptr(), &tp_descr) != NPY_SUCCEED)
+      throw py::error_already_set();
+    // Hold the new descriptor reference until PyArray_NewFromDescr steals
+    // it, so the early exits below do not leak it.
+    py::object descr_owner = py::reinterpret_steal<py::object>(
+        reinterpret_cast<PyObject *>(tp_descr));
+
+    cl_mem_flags mem_flags;
+    PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+            (mem_obj.data(), CL_MEM_FLAGS, sizeof(mem_flags), &mem_flags, 0));
+    if (!(mem_flags & CL_MEM_USE_HOST_PTR))
+      throw pyopencl::error("MemoryObject.get_host_array", CL_INVALID_VALUE,
+                            "Only MemoryObject with USE_HOST_PTR "
+                            "is supported.");
+
+    // accept either a bare integer or an iterable of integers
+    std::vector<npy_intp> dims;
+    try
+    {
+      dims.push_back(py::cast<npy_intp>(shape));
+    }
+    catch (py::cast_error &)
+    {
+      for (auto it: shape)
+        dims.push_back(it.cast<npy_intp>());
+    }
+
+    // A failed conversion leaves a Python exception set; surface it instead
+    // of carrying on with the default order.
+    NPY_ORDER order = PyArray_CORDER;
+    if (PyArray_OrderConverter(order_py.ptr(), &order) != NPY_SUCCEED)
+      throw py::error_already_set();
+
+    int ary_flags = 0;
+    if (order == PyArray_FORTRANORDER)
+      ary_flags |= NPY_FARRAY;
+    else if (order == PyArray_CORDER)
+      ary_flags |= NPY_CARRAY;
+    else
+      throw std::runtime_error("unrecognized order specifier");
+
+    void *host_ptr;
+    size_t mem_obj_size;
+    PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+        (mem_obj.data(), CL_MEM_HOST_PTR, sizeof(host_ptr),
+         &host_ptr, 0));
+    PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+        (mem_obj.data(), CL_MEM_SIZE, sizeof(mem_obj_size),
+         &mem_obj_size, 0));
+
+    // An empty shape yields an empty 'dims'; front() on it would be
+    // undefined, so pass a null pointer like the rest of this file does.
+    py::object result = py::reinterpret_steal<py::object>(PyArray_NewFromDescr(
+        &PyArray_Type,
+        reinterpret_cast<PyArray_Descr *>(descr_owner.release().ptr()),
+        dims.size(), dims.empty( ) ? nullptr : &dims.front(),
+        /*strides*/ nullptr,
+        host_ptr, ary_flags, /*obj*/nullptr));
+    // NULL means numpy has set a Python exception; do not dereference it
+    if (!result)
+      throw py::error_already_set();
+
+    if ((size_t) PyArray_NBYTES(result.ptr()) > mem_obj_size)
+      throw pyopencl::error("MemoryObject.get_host_array",
+          CL_INVALID_VALUE,
+          "Resulting array is larger than memory object.");
+
+    // the array borrows the memory object's host memory: keep the memory
+    // object alive for as long as the array exists
+    PyArray_BASE(result.ptr()) = mem_obj_py.ptr();
+    Py_INCREF(mem_obj_py.ptr());
+
+    return result;
+  }
+#endif
+
+  // }}}
+}
+
+#endif
+
+// vim: foldmethod=marker
diff --git a/src/wrap_cl_part_1.cpp b/src/wrap_cl_part_1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3e6d5e81d2db08e43d60abcfa4b6c6cbfe32b3b
--- /dev/null
+++ b/src/wrap_cl_part_1.cpp
@@ -0,0 +1,328 @@
+#include "wrap_cl.hpp"
+
+
+using namespace pyopencl;
+
+
+// Register the first part of the Python bindings on module 'm': platform,
+// device, context, command queue, events, memory objects, buffers and the
+// buffer transfer functions. The DEF_SIMPLE_* and
+// PYOPENCL_EXPOSE_TO_FROM_INT_PTR macros are defined outside this file.
+void pyopencl_expose_part_1(py::module &m)
+{
+  m.def("get_cl_header_version", get_cl_header_version);
+  m.def("_sizeof_size_t", [](){ return sizeof(size_t); });
+
+  // {{{ platform
+  DEF_SIMPLE_FUNCTION(get_platforms);
+
+  {
+    typedef platform cls;
+    py::class_<cls>(m, "Platform", py::dynamic_attr())
+      .DEF_SIMPLE_METHOD(get_info)
+      .def("get_devices", &cls::get_devices,
+          py::arg("device_type")=CL_DEVICE_TYPE_ALL)
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::hash)
+      PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_platform_id)
+      ;
+  }
+
+  // }}}
+
+  // {{{ device
+  {
+    typedef device cls;
+    py::class_<cls>(m, "Device", py::dynamic_attr())
+      .DEF_SIMPLE_METHOD(get_info)
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::hash)
+#if PYOPENCL_CL_VERSION >= 0x1020
+      .DEF_SIMPLE_METHOD(create_sub_devices)
+#endif
+      PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_device_id)
+      ;
+  }
+
+  // }}}
+
+  // {{{ context
+
+  {
+    typedef context cls;
+    // held by shared_ptr, unlike Platform/Device above
+    py::class_<cls, std::shared_ptr<cls>>(m, "Context", py::dynamic_attr())
+      .def(
+          py::init(
+            [](py::object py_devices, py::object py_properties,
+              py::object py_dev_type)
+            {
+              PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
+                  return create_context_inner(
+                    py_devices,
+                    py_properties,
+                    py_dev_type);
+              )
+            }),
+          py::arg("devices")=py::none(),
+          py::arg("properties")=py::none(),
+          py::arg("dev_type")=py::none()
+          )
+      .DEF_SIMPLE_METHOD(get_info)
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::hash)
+      PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_context)
+      ;
+  }
+
+  // }}}
+
+  // {{{ command queue
+  {
+    typedef command_queue cls;
+    py::class_<cls, std::shared_ptr<cls>>(m, "CommandQueue", py::dynamic_attr())
+      .def(
+        py::init<const context &, const device *, py::object>(),
+        py::arg("context"),
+        // 'device' may be passed as None (arrives as a null pointer)
+        py::arg("device").none(true)=py::none(),
+        py::arg("properties")=py::cast(0))
+      .DEF_SIMPLE_METHOD(get_info)
+#if PYOPENCL_CL_VERSION < 0x1010
+      .DEF_SIMPLE_METHOD(set_property)
+#endif
+      .DEF_SIMPLE_METHOD(flush)
+      .DEF_SIMPLE_METHOD(finish)
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::hash)
+      PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_command_queue)
+      ;
+  }
+
+  // }}}
+
+  // {{{ events/synchronization
+  {
+    typedef event cls;
+    py::class_<cls>(m, "Event", py::dynamic_attr())
+      .DEF_SIMPLE_METHOD(get_info)
+      .DEF_SIMPLE_METHOD(get_profiling_info)
+      .DEF_SIMPLE_METHOD(wait)
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::hash)
+      PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_event)
+#if PYOPENCL_CL_VERSION >= 0x1010
+      .DEF_SIMPLE_METHOD(set_callback)
+#endif
+      ;
+  }
+  {
+    typedef nanny_event cls;
+    py::class_<cls, event>(m, "NannyEvent", py::dynamic_attr())
+      .DEF_SIMPLE_METHOD(get_ward)
+      ;
+  }
+
+  DEF_SIMPLE_FUNCTION(wait_for_events);
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  m.def("_enqueue_marker_with_wait_list", enqueue_marker_with_wait_list,
+      py::arg("queue"), py::arg("wait_for")=py::none()
+      );
+#endif
+  m.def("_enqueue_marker", enqueue_marker,
+      py::arg("queue")
+      );
+  m.def("_enqueue_wait_for_events", enqueue_wait_for_events,
+      py::arg("queue"), py::arg("wait_for")=py::none());
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  m.def("_enqueue_barrier_with_wait_list", enqueue_barrier_with_wait_list,
+      py::arg("queue"), py::arg("wait_for")=py::none()
+      );
+#endif
+  m.def("_enqueue_barrier", enqueue_barrier, py::arg("queue"));
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+  {
+    typedef user_event cls;
+    py::class_<cls, event>(m, "UserEvent", py::dynamic_attr())
+      .def(py::init(
+            [](context &ctx)
+            {
+              return create_user_event(ctx);
+            }),
+          py::arg("context"))
+      .DEF_SIMPLE_METHOD(set_status)
+      ;
+  }
+#endif
+
+  // }}}
+
+  // {{{ memory_object
+
+  {
+    typedef memory_object_holder cls;
+    py::class_<cls>(m, "MemoryObjectHolder", py::dynamic_attr())
+      .DEF_SIMPLE_METHOD(get_info)
+  // FIXME: Reenable in pypy
+#ifndef PYPY_VERSION
+      .def("get_host_array", get_mem_obj_host_array,
+          py::arg("shape"),
+          py::arg("dtype"),
+          py::arg("order")="C")
+#endif
+      // spelled out as lambdas rather than py::self operators
+      .def("__eq__", [](const cls &self, const cls &other){ return self == other; })
+      .def("__ne__", [](const cls &self, const cls &other){ return self != other; })
+      .def("__hash__", &cls::hash)
+
+      .def_property_readonly("int_ptr", to_int_ptr<cls>,
+          "Return an integer corresponding to the pointer value "
+          "of the underlying :c:type:`cl_mem`. "
+          "Use :meth:`from_int_ptr` to turn back into a Python object."
+          "\n\n.. versionadded:: 2013.2\n")
+      ;
+  }
+  {
+    typedef memory_object cls;
+    py::class_<cls, memory_object_holder>(m, "MemoryObject", py::dynamic_attr())
+      .DEF_SIMPLE_METHOD(release)
+      .def_property_readonly("hostbuf", &cls::hostbuf)
+
+      // Adjacent string literals are concatenated verbatim, so every piece
+      // that ends a sentence needs its own trailing space.
+      .def_static("from_int_ptr", memory_object_from_int,
+        "(static method) Return a new Python object referencing the C-level "
+        ":c:type:`cl_mem` object at the location pointed to "
+        "by *int_ptr_value*. The relevant :c:func:`clRetain*` function "
+        "will be called if *retain* is True. "
+        "If the previous owner of the object will *not* release the reference, "
+        "*retain* should be set to *False*, to effectively transfer ownership to "
+        ":mod:`pyopencl`."
+        "\n\n.. versionadded:: 2013.2\n"
+        "\n\n.. versionchanged:: 2016.1\n\n    *retain* added.",
+        py::arg("int_ptr_value"),
+        py::arg("retain")=true)
+      ;
+  }
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  m.def("enqueue_migrate_mem_objects", enqueue_migrate_mem_objects,
+      py::arg("queue"),
+      py::arg("mem_objects"),
+      py::arg("flags")=0,
+      py::arg("wait_for")=py::none()
+      );
+#endif
+
+  // }}}
+
+  // {{{ buffer
+  {
+    typedef buffer cls;
+    py::class_<cls, memory_object>(m, "Buffer", py::dynamic_attr())
+      .def(
+          py::init(
+            [](context &ctx, cl_mem_flags flags, size_t size, py::object py_hostbuf)
+            { return create_buffer_py(ctx, flags, size, py_hostbuf); }
+            ),
+          py::arg("context"),
+          py::arg("flags"),
+          py::arg("size")=0,
+          py::arg("hostbuf")=py::none()
+          )
+#if PYOPENCL_CL_VERSION >= 0x1010
+      .def("get_sub_region", &cls::get_sub_region,
+          py::arg("origin"),
+          py::arg("size"),
+          py::arg("flags")=0
+          )
+      .def("__getitem__", &cls::getitem)
+#endif
+      ;
+  }
+
+  // }}}
+
+  // {{{ transfers
+
+  // {{{ byte-for-byte
+  m.def("_enqueue_read_buffer", enqueue_read_buffer,
+      py::arg("queue"),
+      py::arg("mem"),
+      py::arg("hostbuf"),
+      py::arg("device_offset")=0,
+      py::arg("wait_for")=py::none(),
+      py::arg("is_blocking")=true
+      );
+  m.def("_enqueue_write_buffer", enqueue_write_buffer,
+      py::arg("queue"),
+      py::arg("mem"),
+      py::arg("hostbuf"),
+      py::arg("device_offset")=0,
+      py::arg("wait_for")=py::none(),
+      py::arg("is_blocking")=true
+      );
+  m.def("_enqueue_copy_buffer", enqueue_copy_buffer,
+      py::arg("queue"),
+      py::arg("src"),
+      py::arg("dst"),
+      py::arg("byte_count")=-1,
+      py::arg("src_offset")=0,
+      py::arg("dst_offset")=0,
+      py::arg("wait_for")=py::none()
+      );
+
+  // }}}
+
+  // {{{ rectangular
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+  m.def("_enqueue_read_buffer_rect", enqueue_read_buffer_rect,
+      py::arg("queue"),
+      py::arg("mem"),
+      py::arg("hostbuf"),
+      py::arg("buffer_origin"),
+      py::arg("host_origin"),
+      py::arg("region"),
+      py::arg("buffer_pitches")=py::none(),
+      py::arg("host_pitches")=py::none(),
+      py::arg("wait_for")=py::none(),
+      py::arg("is_blocking")=true
+      );
+  m.def("_enqueue_write_buffer_rect", enqueue_write_buffer_rect,
+      py::arg("queue"),
+      py::arg("mem"),
+      py::arg("hostbuf"),
+      py::arg("buffer_origin"),
+      py::arg("host_origin"),
+      py::arg("region"),
+      py::arg("buffer_pitches")=py::none(),
+      py::arg("host_pitches")=py::none(),
+      py::arg("wait_for")=py::none(),
+      py::arg("is_blocking")=true
+      );
+  m.def("_enqueue_copy_buffer_rect", enqueue_copy_buffer_rect,
+      py::arg("queue"),
+      py::arg("src"),
+      py::arg("dst"),
+      py::arg("src_origin"),
+      py::arg("dst_origin"),
+      py::arg("region"),
+      py::arg("src_pitches")=py::none(),
+      py::arg("dst_pitches")=py::none(),
+      py::arg("wait_for")=py::none()
+      );
+#endif
+
+  // }}}
+
+  // }}}
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  m.def("_enqueue_fill_buffer", enqueue_fill_buffer,
+      py::arg("queue"), py::arg("mem"), py::arg("pattern"),
+      py::arg("offset"), py::arg("size"),
+      py::arg("wait_for")=py::none());
+#endif
+}
+
+// vim: foldmethod=marker
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b4c2c9e8eea5d18fdb68cdc0549ab781b353580
--- /dev/null
+++ b/src/wrap_cl_part_2.cpp
@@ -0,0 +1,559 @@
+#include "wrap_cl.hpp"
+
+
+
+
+namespace pyopencl {
+#if PYOPENCL_CL_VERSION >= 0x1020
+  // Shared getter for the setter-only ImageDescriptor properties: always None.
+  py::object image_desc_dummy_getter(cl_image_desc &desc)
+  { return py::none(); }
+
+  // Copy a shape triple into desc. COPY_PY_REGION_TRIPLE supplies the local
+  // `shape` array from py_shape.
+  void image_desc_set_shape(cl_image_desc &desc, py::object py_shape)
+  {
+    COPY_PY_REGION_TRIPLE(shape);
+    desc.image_width = shape[0];
+    desc.image_height = shape[1];
+    // shape[2] is written to both the depth and the array size.
+    desc.image_depth = desc.image_array_size = shape[2];
+  }
+
+  // Copy the row and slice pitch into desc (COPY_PY_PITCH_TUPLE -> `pitches`).
+  void image_desc_set_pitches(cl_image_desc &desc, py::object py_pitches)
+  {
+    COPY_PY_PITCH_TUPLE(pitches);
+    desc.image_row_pitch = pitches[0];
+    desc.image_slice_pitch = pitches[1];
+  }
+
+  // Store mobj's data() handle in the descriptor, or 0 when mobj is null.
+  void image_desc_set_buffer(cl_image_desc &desc, memory_object *mobj)
+  {
+    desc.buffer = mobj ? mobj->data() : 0;
+  }
+
+#endif
+}
+
+
+
+
+using namespace pyopencl;
+
+
+
+
+void pyopencl_expose_part_2(py::module &m)
+{
+  // {{{ image
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  {
+    typedef cl_image_desc cls;
+    py::class_<cls>(m, "ImageDescriptor")
+      .def(py::init<>())
+      .def_readwrite("image_type", &cls::image_type)
+      .def_property("shape", &image_desc_dummy_getter, image_desc_set_shape)
+      .def_readwrite("array_size", &cls::image_array_size)
+      .def_property("pitches", &image_desc_dummy_getter, image_desc_set_pitches)
+      .def_readwrite("num_mip_levels", &cls::num_mip_levels)
+      .def_readwrite("num_samples", &cls::num_samples)
+      .def_property("buffer", &image_desc_dummy_getter, image_desc_set_buffer)
+      ;
+  }
+#endif
+
+  {
+    typedef image cls;
+    py::class_<cls, memory_object>(m, "Image", py::dynamic_attr())
+      .def(
+          py::init(
+            [](
+              context const &ctx,
+              cl_mem_flags flags,
+              cl_image_format const &fmt,
+              py::sequence shape,
+              py::sequence pitches,
+              py::object buffer)
+            {
+              return create_image(ctx, flags, fmt, shape, pitches, buffer);
+            }),
+          py::arg("context"),
+          py::arg("flags"),
+          py::arg("format"),
+          py::arg("shape")=py::none(),
+          py::arg("pitches")=py::none(),
+          py::arg("hostbuf")=py::none()
+          )
+#if PYOPENCL_CL_VERSION >= 0x1020
+      .def(
+          py::init(
+            [](
+              context const &ctx,
+              cl_mem_flags flags,
+              cl_image_format const &fmt,
+              cl_image_desc &desc,
+              py::object buffer)
+            {
+              return create_image_from_desc(ctx, flags, fmt, desc, buffer);
+            }),
+          py::arg("context"),
+          py::arg("flags"),
+          py::arg("format"),
+          py::arg("desc"),
+          py::arg("hostbuf")=py::none()
+          )
+#endif
+      .DEF_SIMPLE_METHOD(get_image_info)
+      ;
+  }
+
+  {
+    typedef cl_image_format cls;
+    py::class_<cls>(m, "ImageFormat")
+      .def(
+          py::init(
+            [](cl_channel_order ord, cl_channel_type tp)
+            {
+              return make_image_format(ord, tp);
+            }))
+      .def_readwrite("channel_order", &cls::image_channel_order)
+      .def_readwrite("channel_data_type", &cls::image_channel_data_type)
+      .def_property_readonly("channel_count", &get_image_format_channel_count)
+      .def_property_readonly("dtype_size", &get_image_format_channel_dtype_size)
+      .def_property_readonly("itemsize", &get_image_format_item_size)
+      ;
+  }
+
+  DEF_SIMPLE_FUNCTION(get_supported_image_formats);
+
+  m.def("_enqueue_read_image", enqueue_read_image,
+      py::arg("queue"),
+      py::arg("mem"),
+      py::arg("origin"),
+      py::arg("region"),
+      py::arg("hostbuf"),
+      py::arg("row_pitch")=0,
+      py::arg("slice_pitch")=0,
+      py::arg("wait_for")=py::none(),
+      py::arg("is_blocking")=true
+      );
+  m.def("_enqueue_write_image", enqueue_write_image,
+      py::arg("queue"),
+      py::arg("mem"),
+      py::arg("origin"),
+      py::arg("region"),
+      py::arg("hostbuf"),
+      py::arg("row_pitch")=0,
+      py::arg("slice_pitch")=0,
+      py::arg("wait_for")=py::none(),
+      py::arg("is_blocking")=true
+      );
+
+  m.def("_enqueue_copy_image", enqueue_copy_image,
+      py::arg("queue"),
+      py::arg("src"),
+      py::arg("dest"),
+      py::arg("src_origin"),
+      py::arg("dest_origin"),
+      py::arg("region"),
+      py::arg("wait_for")=py::none()
+      );
+  m.def("_enqueue_copy_image_to_buffer", enqueue_copy_image_to_buffer,
+      py::arg("queue"),
+      py::arg("src"),
+      py::arg("dest"),
+      py::arg("origin"),
+      py::arg("region"),
+      py::arg("offset"),
+      py::arg("wait_for")=py::none()
+      );
+  m.def("_enqueue_copy_buffer_to_image", enqueue_copy_buffer_to_image,
+      py::arg("queue"),
+      py::arg("src"),
+      py::arg("dest"),
+      py::arg("offset"),
+      py::arg("origin"),
+      py::arg("region"),
+      py::arg("wait_for")=py::none()
+      );
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  m.def("enqueue_fill_image", enqueue_fill_image,
+      py::arg("queue"),
+      py::arg("mem"),
+      py::arg("color"),
+      py::arg("origin"),
+      py::arg("region"),
+      py::arg("wait_for")=py::none()
+      );
+#endif
+
+  // }}}
+
+  // {{{ memory_map
+  {
+    typedef memory_map cls;
+    py::class_<cls>(m, "MemoryMap", py::dynamic_attr())
+      .def("release", &cls::release,
+          py::arg("queue").none(true)=nullptr,
+          py::arg("wait_for")=py::none()
+          )
+      ;
+  }
+
+  // FIXME: Reenable in pypy
+#ifndef PYPY_VERSION
+  m.def("enqueue_map_buffer", enqueue_map_buffer,
+      py::arg("queue"),
+      py::arg("buf"),
+      py::arg("flags"),
+      py::arg("offset"),
+      py::arg("shape"),
+      py::arg("dtype"),
+      py::arg("order")="C",
+      py::arg("strides")=py::none(),
+      py::arg("wait_for")=py::none(),
+      py::arg("is_blocking")=true);
+  m.def("enqueue_map_image", enqueue_map_image,
+      py::arg("queue"),
+      py::arg("img"),
+      py::arg("flags"),
+      py::arg("origin"),
+      py::arg("region"),
+      py::arg("shape"),
+      py::arg("dtype"),
+      py::arg("order")="C",
+      py::arg("strides")=py::none(),
+      py::arg("wait_for")=py::none(),
+      py::arg("is_blocking")=true);
+#endif
+
+  // }}}
+
+  // {{{ svm
+
+#if PYOPENCL_CL_VERSION >= 0x2000
+  {
+    typedef svm_arg_wrapper cls;
+    py::class_<cls>(m, "SVM", py::dynamic_attr())
+      .def(py::init<py::object>())
+      ;
+  }
+
+  {
+    typedef svm_allocation cls;
+    py::class_<cls>(m, "SVMAllocation", py::dynamic_attr())
+      .def(py::init<std::shared_ptr<context>, size_t, cl_uint, cl_svm_mem_flags>())
+      .DEF_SIMPLE_METHOD(release)
+      .def("enqueue_release", &cls::enqueue_release,
+          ":returns: a :class:`pyopencl.Event`\n\n"
+          "|std-enqueue-blurb|")
+      .def("_ptr_as_int", &cls::ptr_as_int)
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::ptr_as_int)
+      ;
+  }
+
+  // "_enqueue_svm_memcpyw" looks like a typo for "_enqueue_svm_memcpy"; expose
+  // the corrected name and keep the old spelling as an alias for callers.
+  for (const char *name: {"_enqueue_svm_memcpy", "_enqueue_svm_memcpyw"})
+    m.def(name, enqueue_svm_memcpy,
+        py::arg("queue"), py::arg("is_blocking"),
+        py::arg("dst"), py::arg("src"),
+        py::arg("wait_for")=py::none());
+
+  m.def("_enqueue_svm_memfill", enqueue_svm_memfill,
+      py::arg("queue"),
+      py::arg("dst"),
+      py::arg("pattern"),
+      py::arg("byte_count")=py::none(),
+      py::arg("wait_for")=py::none()
+      );
+
+  m.def("_enqueue_svm_map", enqueue_svm_map,
+      py::arg("queue"),
+      py::arg("is_blocking"),
+      py::arg("flags"),
+      py::arg("svm"),
+      py::arg("wait_for")=py::none()
+      );
+
+  m.def("_enqueue_svm_unmap", enqueue_svm_unmap,
+      py::arg("queue"),
+      py::arg("svm"),
+      py::arg("wait_for")=py::none()
+      );
+#endif
+
+#if PYOPENCL_CL_VERSION >= 0x2010
+  m.def("_enqueue_svm_migrate_mem", enqueue_svm_migratemem,
+      py::arg("queue"),
+      py::arg("svms"),
+      py::arg("flags")=py::none(),
+      py::arg("wait_for")=py::none()
+      );
+#endif
+
+  // }}}
+
+  // {{{ sampler
+  {
+    typedef sampler cls;
+    py::class_<cls>(m, "Sampler", py::dynamic_attr())
+#if PYOPENCL_CL_VERSION >= 0x2000
+      .def(py::init<context const &, py::sequence>())
+#endif
+      .def(py::init<context const &, bool, cl_addressing_mode, cl_filter_mode>())
+      .DEF_SIMPLE_METHOD(get_info)
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::hash)
+      PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_sampler)
+      ;
+  }
+
+  // }}}
+
+  // {{{ program
+  {
+    // Python bindings for program: the program_kind enum, then "_Program".
+    typedef program cls;
+    py::enum_<cls::program_kind_type>(m, "program_kind")
+      .value("UNKNOWN", cls::KND_UNKNOWN)
+      .value("SOURCE", cls::KND_SOURCE)
+      .value("BINARY", cls::KND_BINARY)
+      ;
+
+    py::class_<cls>(m, "_Program", py::dynamic_attr())
+      .def(
+          py::init(
+            [](context &ctx, std::string const &src)
+            {
+              return create_program_with_source(ctx, src);
+            }),
+          py::arg("context"), py::arg("src"))
+      .def(
+          py::init(
+            [](context &ctx, py::sequence devices, py::sequence binaries)
+            {
+              return create_program_with_binary(ctx, devices, binaries);
+            }),
+          py::arg("context"),
+          py::arg("devices"),
+          py::arg("binaries"))
+#if (PYOPENCL_CL_VERSION >= 0x1020) && \
+      ((PYOPENCL_CL_VERSION >= 0x1030) && defined(__APPLE__))
+      .def_static("create_with_built_in_kernels",
+          create_program_with_built_in_kernels,
+          py::arg("context"),
+          py::arg("devices"),
+          py::arg("kernel_names"),
+          // pybind11 counterpart of Boost.Python's manage_new_object policy.
+          py::return_value_policy::take_ownership)
+#endif
+      .DEF_SIMPLE_METHOD(kind)
+      .DEF_SIMPLE_METHOD(get_info)
+      .DEF_SIMPLE_METHOD(get_build_info)
+      .def("_build", &cls::build,
+          py::arg("options")="",
+          py::arg("devices")=py::none())
+#if PYOPENCL_CL_VERSION >= 0x1020
+      .def("compile", &cls::compile,
+          py::arg("options")="",
+          py::arg("devices")=py::none(),
+          py::arg("headers")=py::list())
+      .def_static("link", &link_program,
+          py::arg("context"),
+          py::arg("programs"),
+          py::arg("options")="",
+          py::arg("devices")=py::none())
+#endif
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::hash)
+      .def("all_kernels", create_kernels_in_program)
+      PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_program)
+      ;
+  }
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+  m.def("unload_platform_compiler", unload_platform_compiler);
+#endif
+
+  // }}}
+
+  // {{{ kernel
+
+  {
+    typedef kernel cls;
+    py::class_<cls>(m, "Kernel", py::dynamic_attr())
+      .def(py::init<const program &, std::string const &>())
+      .DEF_SIMPLE_METHOD(get_info)
+      .DEF_SIMPLE_METHOD(get_work_group_info)
+      .def("_set_arg_null", &cls::set_arg_null)
+      .def("_set_arg_buf", &cls::set_arg_buf)
+#if PYOPENCL_CL_VERSION >= 0x2000
+      .def("_set_arg_svm", &cls::set_arg_svm)
+#endif
+      .DEF_SIMPLE_METHOD(set_arg)
+#if PYOPENCL_CL_VERSION >= 0x1020
+      .DEF_SIMPLE_METHOD(get_arg_info)
+#endif
+      .def(py::self == py::self)
+      .def(py::self != py::self)
+      .def("__hash__", &cls::hash)
+      PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_kernel)
+      ;
+  }
+
+  {
+    typedef local_memory cls;
+    py::class_<cls>(m, "LocalMemory", py::dynamic_attr())
+      .def(
+          py::init<size_t>(),
+          py::arg("size"))
+      .def_property_readonly("size", &cls::size)
+      ;
+  }
+
+
+  m.def("enqueue_nd_range_kernel", enqueue_nd_range_kernel,
+      py::arg("queue"),
+      py::arg("kernel"),
+      py::arg("global_work_size"),
+      py::arg("local_work_size"),
+      py::arg("global_work_offset")=py::none(),
+      py::arg("wait_for")=py::none(),
+      py::arg("g_times_l")=false
+      );
+
+  // TODO: clEnqueueNativeKernel
+  // }}}
+
+  // {{{ GL interop
+  DEF_SIMPLE_FUNCTION(have_gl);
+
+#ifdef HAVE_GL
+
+#ifdef __APPLE__
+  DEF_SIMPLE_FUNCTION(get_apple_cgl_share_group);
+#endif /* __APPLE__ */
+
+  {
+    typedef gl_buffer cls;
+    py::class_<cls, memory_object>(m, "GLBuffer", py::dynamic_attr())
+      .def(
+          py::init(
+            [](context &ctx, cl_mem_flags flags, GLuint bufobj)
+            {
+              return create_from_gl_buffer(ctx, flags, bufobj);
+            }),
+          py::arg("context"),
+          py::arg("flags"),
+          py::arg("bufobj"))
+      .def("get_gl_object_info", get_gl_object_info)
+      ;
+  }
+
+  {
+    typedef gl_renderbuffer cls;
+    py::class_<cls, memory_object>(m, "GLRenderBuffer", py::dynamic_attr())
+      .def(
+          py::init(
+            [](context &ctx, cl_mem_flags flags, GLuint bufobj)
+            {
+              return create_from_gl_renderbuffer(ctx, flags, bufobj);
+            }),
+          py::arg("context"),
+          py::arg("flags"),
+          py::arg("bufobj"))
+      .def("get_gl_object_info", get_gl_object_info)
+      ;
+  }
+
+  {
+    typedef gl_texture cls;
+    py::class_<cls, image>(m, "GLTexture", py::dynamic_attr())
+      .def(
+          py::init(
+            [](context &ctx, cl_mem_flags flags, GLenum texture_target,
+              GLint miplevel, GLuint texture, unsigned dims)
+            {
+              return create_from_gl_texture(ctx, flags, texture_target, miplevel, texture, dims);
+            }),
+          py::arg("context"),
+          py::arg("flags"),
+          py::arg("texture_target"),
+          py::arg("miplevel"),
+          py::arg("texture"),
+          py::arg("dims"))
+      .def("get_gl_object_info", get_gl_object_info)
+      .DEF_SIMPLE_METHOD(get_gl_texture_info)
+      ;
+  }
+
+  m.def("enqueue_acquire_gl_objects", enqueue_acquire_gl_objects,
+      py::arg("queue"),
+      py::arg("mem_objects"),
+      py::arg("wait_for")=py::none()
+      );
+  m.def("enqueue_release_gl_objects", enqueue_release_gl_objects,
+      py::arg("queue"),
+      py::arg("mem_objects"),
+      py::arg("wait_for")=py::none()
+      );
+
+#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
+  m.def("get_gl_context_info_khr", get_gl_context_info_khr,
+      py::arg("properties"),
+      py::arg("param_name"),
+      py::arg("platform")=py::none()
+      );
+#endif
+
+#endif
+  // }}}
+
+  // {{{ CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
+
+  {
+    // DeviceTopologyAmd: exposes the pcie member of cl_device_topology_amd.
+    typedef cl_device_topology_amd cls;
+    py::class_<cls>(m, "DeviceTopologyAmd")
+      .def(py::init(
+            [](cl_char bus, cl_char device, cl_char function)
+            {
+              // Zero everything first: pcie.type is never assigned here.
+              cl_device_topology_amd result = cl_device_topology_amd();
+              result.pcie.bus = bus;
+              result.pcie.device = device;
+              result.pcie.function = function;
+              return result;
+            }),
+          py::arg("bus")=0,
+          py::arg("device")=0,
+          py::arg("function")=0)
+      .def_property("type",
+          [](cls &t) { return t.pcie.type; },
+          [](cls &t, cl_uint val) { t.pcie.type = val; })
+      .def_property("bus",
+          [](cls &t) { return t.pcie.bus; },
+          [](cls &t, cl_char val) { t.pcie.bus = val; })
+      .def_property("device",
+          [](cls &t) { return t.pcie.device; },
+          [](cls &t, cl_char val) { t.pcie.device = val; })
+      .def_property("function",
+          [](cls &t) { return t.pcie.function; },
+          [](cls &t, cl_char val) { t.pcie.function = val; })
+      ;
+  }
+
+  // }}}
+}
+
+
+// vim: foldmethod=marker
diff --git a/src/wrap_constants.cpp b/src/wrap_constants.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b6a97f16fbf2083534d8175b0395981461c0b70
--- /dev/null
+++ b/src/wrap_constants.cpp
@@ -0,0 +1,988 @@
+#include "wrap_cl.hpp"
+
+
+using namespace pyopencl;
+
+
+namespace
+{
+  // {{{ empty tag types: each one backs a py::class_ that only holds constants
+  struct status_code {};
+  struct platform_info {};
+  struct device_type {};
+  struct device_info {};
+  struct device_fp_config {};
+  struct device_mem_cache_type {};
+  struct device_local_mem_type {};
+  struct device_exec_capabilities {};
+  struct device_svm_capabilities {};
+  struct command_queue_properties {};
+  struct context_info {};
+  struct gl_context_info {};
+  struct context_properties {};
+  struct command_queue_info {};
+  struct queue_properties {};
+  struct mem_flags {};
+  struct svm_mem_flags {};
+  struct channel_order {};
+  struct channel_type {};
+  struct mem_object_type {};
+  struct mem_info {};
+  struct image_info {};
+  struct addressing_mode {};
+  struct filter_mode {};
+  struct sampler_info {};
+  struct map_flags {};
+  struct program_info {};
+  struct program_build_info {};
+  struct program_binary_type {};
+  struct build_status {};
+  struct kernel_info {};
+  struct kernel_arg_info {};
+  struct kernel_arg_address_qualifier {};
+  struct kernel_arg_access_qualifier {};
+  struct kernel_arg_type_qualifier {};
+  struct kernel_work_group_info {};
+  struct event_info {};
+  struct command_type {};
+  struct command_execution_status {};
+  struct profiling_info {};
+  struct buffer_create_type {};
+  struct mem_migration_flags {};
+
+  struct device_partition_property {};
+  struct device_affinity_domain {};
+
+  struct gl_object_type {};
+  struct gl_texture_info {};
+
+  // }}}
+}
+
+
+void pyopencl_expose_constants(py::module &m)
+{
+  // {{{ exceptions
+  {
+#define DECLARE_EXC(NAME, BASE) \
+  static py::exception<pyopencl::error> CL##NAME(m, #NAME, BASE);
+
+    DECLARE_EXC(Error, NULL);
+    DECLARE_EXC(MemoryError, CLError.ptr());
+    DECLARE_EXC(LogicError, CLError.ptr());
+    DECLARE_EXC(RuntimeError, CLError.ptr());
+
+    py::register_exception_translator(
+        [](std::exception_ptr p)
+        {
+          try
+          {
+            if (p) std::rethrow_exception(p);
+          }
+          catch (pyopencl::error &err)
+          {
+            py::object err_obj = py::cast(err);
+            if (err.code() == CL_MEM_OBJECT_ALLOCATION_FAILURE)
+              PyErr_SetObject(CLMemoryError.ptr(), err_obj.ptr());
+            else if (err.code() <= CL_INVALID_VALUE)
+              PyErr_SetObject(CLLogicError.ptr(), err_obj.ptr());
+            else if (err.code() > CL_INVALID_VALUE && err.code() < CL_SUCCESS)
+              PyErr_SetObject(CLRuntimeError.ptr(), err_obj.ptr());
+            else
+              PyErr_SetObject(CLError.ptr(), err_obj.ptr());
+          }
+        });
+  }
+  // }}}
+
+  // {{{ error record
+
+  {
+    typedef error cls;
+    py::class_<error> (m, "_ErrorRecord")
+      .def(py::init<const char *, cl_int, const char *>(),
+          py::arg("routine"),
+          py::arg("code"),
+          py::arg("msg"))
+      .DEF_SIMPLE_METHOD(routine)
+      .DEF_SIMPLE_METHOD(code)
+      .DEF_SIMPLE_METHOD(what)
+      .DEF_SIMPLE_METHOD(is_out_of_memory)
+      ;
+  }
+
+  // }}}
+
+  // {{{ constants
+#define ADD_ATTR(PREFIX, NAME) \
+  cls.attr(#NAME) = CL_##PREFIX##NAME
+#define ADD_ATTR_SUFFIX(PREFIX, NAME, SUFFIX) \
+  cls.attr(#NAME) = CL_##PREFIX##NAME##SUFFIX
+
+  {
+    py::class_<status_code> cls(m, "status_code");
+
+    ADD_ATTR(, SUCCESS);
+    ADD_ATTR(, DEVICE_NOT_FOUND);
+    ADD_ATTR(, DEVICE_NOT_AVAILABLE);
+#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
+    ADD_ATTR(, COMPILER_NOT_AVAILABLE);
+#endif
+    ADD_ATTR(, MEM_OBJECT_ALLOCATION_FAILURE);
+    ADD_ATTR(, OUT_OF_RESOURCES);
+    ADD_ATTR(, OUT_OF_HOST_MEMORY);
+    ADD_ATTR(, PROFILING_INFO_NOT_AVAILABLE);
+    ADD_ATTR(, MEM_COPY_OVERLAP);
+    ADD_ATTR(, IMAGE_FORMAT_MISMATCH);
+    ADD_ATTR(, IMAGE_FORMAT_NOT_SUPPORTED);
+    ADD_ATTR(, BUILD_PROGRAM_FAILURE);
+    ADD_ATTR(, MAP_FAILURE);
+
+    ADD_ATTR(, INVALID_VALUE);
+    ADD_ATTR(, INVALID_DEVICE_TYPE);
+    ADD_ATTR(, INVALID_PLATFORM);
+    ADD_ATTR(, INVALID_DEVICE);
+    ADD_ATTR(, INVALID_CONTEXT);
+    ADD_ATTR(, INVALID_QUEUE_PROPERTIES);
+    ADD_ATTR(, INVALID_COMMAND_QUEUE);
+    ADD_ATTR(, INVALID_HOST_PTR);
+    ADD_ATTR(, INVALID_MEM_OBJECT);
+    ADD_ATTR(, INVALID_IMAGE_FORMAT_DESCRIPTOR);
+    ADD_ATTR(, INVALID_IMAGE_SIZE);
+    ADD_ATTR(, INVALID_SAMPLER);
+    ADD_ATTR(, INVALID_BINARY);
+    ADD_ATTR(, INVALID_BUILD_OPTIONS);
+    ADD_ATTR(, INVALID_PROGRAM);
+    ADD_ATTR(, INVALID_PROGRAM_EXECUTABLE);
+    ADD_ATTR(, INVALID_KERNEL_NAME);
+    ADD_ATTR(, INVALID_KERNEL_DEFINITION);
+    ADD_ATTR(, INVALID_KERNEL);
+    ADD_ATTR(, INVALID_ARG_INDEX);
+    ADD_ATTR(, INVALID_ARG_VALUE);
+    ADD_ATTR(, INVALID_ARG_SIZE);
+    ADD_ATTR(, INVALID_KERNEL_ARGS);
+    ADD_ATTR(, INVALID_WORK_DIMENSION);
+    ADD_ATTR(, INVALID_WORK_GROUP_SIZE);
+    ADD_ATTR(, INVALID_WORK_ITEM_SIZE);
+    ADD_ATTR(, INVALID_GLOBAL_OFFSET);
+    ADD_ATTR(, INVALID_EVENT_WAIT_LIST);
+    ADD_ATTR(, INVALID_EVENT);
+    ADD_ATTR(, INVALID_OPERATION);
+    ADD_ATTR(, INVALID_GL_OBJECT);
+    ADD_ATTR(, INVALID_BUFFER_SIZE);
+    ADD_ATTR(, INVALID_MIP_LEVEL);
+
+#if defined(cl_khr_icd) && (cl_khr_icd >= 1)
+    ADD_ATTR(, PLATFORM_NOT_FOUND_KHR);
+#endif
+
+#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
+    ADD_ATTR(, INVALID_GL_SHAREGROUP_REFERENCE_KHR);
+#endif
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(, MISALIGNED_SUB_BUFFER_OFFSET);
+    ADD_ATTR(, EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+    ADD_ATTR(, INVALID_GLOBAL_WORK_SIZE);
+#endif
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(, COMPILE_PROGRAM_FAILURE);
+    ADD_ATTR(, LINKER_NOT_AVAILABLE);
+    ADD_ATTR(, LINK_PROGRAM_FAILURE);
+    ADD_ATTR(, DEVICE_PARTITION_FAILED);
+    ADD_ATTR(, KERNEL_ARG_INFO_NOT_AVAILABLE);
+    ADD_ATTR(, INVALID_IMAGE_DESCRIPTOR);
+    ADD_ATTR(, INVALID_COMPILER_OPTIONS);
+    ADD_ATTR(, INVALID_LINKER_OPTIONS);
+    ADD_ATTR(, INVALID_DEVICE_PARTITION_COUNT);
+#endif
+
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(, INVALID_PIPE_SIZE);
+    ADD_ATTR(, INVALID_DEVICE_QUEUE);
+#endif
+
+#if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION)
+    ADD_ATTR(, DEVICE_PARTITION_FAILED_EXT);
+    ADD_ATTR(, INVALID_PARTITION_COUNT_EXT);
+    ADD_ATTR(, INVALID_PARTITION_NAME_EXT);
+#endif
+  }
+
+  {
+    py::class_<platform_info> cls(m, "platform_info");
+    ADD_ATTR(PLATFORM_, PROFILE);
+    ADD_ATTR(PLATFORM_, VERSION);
+    ADD_ATTR(PLATFORM_, NAME);
+    ADD_ATTR(PLATFORM_, VENDOR);
+#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
+    ADD_ATTR(PLATFORM_, EXTENSIONS);
+#endif
+  }
+
+  {
+    py::class_<device_type> cls(m, "device_type");
+    ADD_ATTR(DEVICE_TYPE_, DEFAULT);
+    ADD_ATTR(DEVICE_TYPE_, CPU);
+    ADD_ATTR(DEVICE_TYPE_, GPU);
+    ADD_ATTR(DEVICE_TYPE_, ACCELERATOR);
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(DEVICE_TYPE_, CUSTOM);
+#endif
+    ADD_ATTR(DEVICE_TYPE_, ALL);
+  }
+
+  {
+    py::class_<device_info> cls(m, "device_info");
+    ADD_ATTR(DEVICE_, TYPE);
+    ADD_ATTR(DEVICE_, VENDOR_ID);
+    ADD_ATTR(DEVICE_, MAX_COMPUTE_UNITS);
+    ADD_ATTR(DEVICE_, MAX_WORK_ITEM_DIMENSIONS);
+    ADD_ATTR(DEVICE_, MAX_WORK_GROUP_SIZE);
+    ADD_ATTR(DEVICE_, MAX_WORK_ITEM_SIZES);
+    ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_CHAR);
+    ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_SHORT);
+    ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_INT);
+    ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_LONG);
+    ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_FLOAT);
+    ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_DOUBLE);
+    ADD_ATTR(DEVICE_, MAX_CLOCK_FREQUENCY);
+    ADD_ATTR(DEVICE_, ADDRESS_BITS);
+    ADD_ATTR(DEVICE_, MAX_READ_IMAGE_ARGS);
+    ADD_ATTR(DEVICE_, MAX_WRITE_IMAGE_ARGS);
+    ADD_ATTR(DEVICE_, MAX_MEM_ALLOC_SIZE);
+    ADD_ATTR(DEVICE_, IMAGE2D_MAX_WIDTH);
+    ADD_ATTR(DEVICE_, IMAGE2D_MAX_HEIGHT);
+    ADD_ATTR(DEVICE_, IMAGE3D_MAX_WIDTH);
+    ADD_ATTR(DEVICE_, IMAGE3D_MAX_HEIGHT);
+    ADD_ATTR(DEVICE_, IMAGE3D_MAX_DEPTH);
+    ADD_ATTR(DEVICE_, IMAGE_SUPPORT);
+    ADD_ATTR(DEVICE_, MAX_PARAMETER_SIZE);
+    ADD_ATTR(DEVICE_, MAX_SAMPLERS);
+    ADD_ATTR(DEVICE_, MEM_BASE_ADDR_ALIGN);
+    ADD_ATTR(DEVICE_, MIN_DATA_TYPE_ALIGN_SIZE);
+    ADD_ATTR(DEVICE_, SINGLE_FP_CONFIG);
+#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
+    ADD_ATTR(DEVICE_, DOUBLE_FP_CONFIG);
+#endif
+#ifdef CL_DEVICE_HALF_FP_CONFIG
+    ADD_ATTR(DEVICE_, HALF_FP_CONFIG);
+#endif
+    ADD_ATTR(DEVICE_, GLOBAL_MEM_CACHE_TYPE);
+    ADD_ATTR(DEVICE_, GLOBAL_MEM_CACHELINE_SIZE);
+    ADD_ATTR(DEVICE_, GLOBAL_MEM_CACHE_SIZE);
+    ADD_ATTR(DEVICE_, GLOBAL_MEM_SIZE);
+    ADD_ATTR(DEVICE_, MAX_CONSTANT_BUFFER_SIZE);
+    ADD_ATTR(DEVICE_, MAX_CONSTANT_ARGS);
+    ADD_ATTR(DEVICE_, LOCAL_MEM_TYPE);
+    ADD_ATTR(DEVICE_, LOCAL_MEM_SIZE);
+    ADD_ATTR(DEVICE_, ERROR_CORRECTION_SUPPORT);
+    ADD_ATTR(DEVICE_, PROFILING_TIMER_RESOLUTION);
+    ADD_ATTR(DEVICE_, ENDIAN_LITTLE);
+    ADD_ATTR(DEVICE_, AVAILABLE);
+    ADD_ATTR(DEVICE_, COMPILER_AVAILABLE);
+    ADD_ATTR(DEVICE_, EXECUTION_CAPABILITIES);
+    ADD_ATTR(DEVICE_, QUEUE_PROPERTIES);
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(DEVICE_, QUEUE_ON_HOST_PROPERTIES);
+#endif
+    ADD_ATTR(DEVICE_, NAME);
+    ADD_ATTR(DEVICE_, VENDOR);
+    // The driver version macro is CL_DRIVER_VERSION, hence the empty prefix.
+    ADD_ATTR(, DRIVER_VERSION);
+    ADD_ATTR(DEVICE_, VERSION);
+    ADD_ATTR(DEVICE_, PROFILE);
+    ADD_ATTR(DEVICE_, EXTENSIONS);
+    ADD_ATTR(DEVICE_, PLATFORM);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_HALF);
+    ADD_ATTR(DEVICE_, HOST_UNIFIED_MEMORY);
+    ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_CHAR);
+    ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_SHORT);
+    ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_INT);
+    ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_LONG);
+    ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_FLOAT);
+    ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_DOUBLE);
+    ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_HALF);
+    ADD_ATTR(DEVICE_, OPENCL_C_VERSION);
+#endif
+// support for cl_nv_device_attribute_query
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+    ADD_ATTR(DEVICE_, COMPUTE_CAPABILITY_MAJOR_NV);
+    ADD_ATTR(DEVICE_, COMPUTE_CAPABILITY_MINOR_NV);
+    ADD_ATTR(DEVICE_, REGISTERS_PER_BLOCK_NV);
+    ADD_ATTR(DEVICE_, WARP_SIZE_NV);
+    ADD_ATTR(DEVICE_, GPU_OVERLAP_NV);
+    ADD_ATTR(DEVICE_, KERNEL_EXEC_TIMEOUT_NV);
+    ADD_ATTR(DEVICE_, INTEGRATED_MEMORY_NV);
+    // Nvidia specific device attributes, not defined in Khronos CL/cl_ext.h
+#ifdef CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV
+    ADD_ATTR(DEVICE_, ATTRIBUTE_ASYNC_ENGINE_COUNT_NV);
+#endif
+#ifdef CL_DEVICE_PCI_BUS_ID_NV
+    ADD_ATTR(DEVICE_, PCI_BUS_ID_NV);
+#endif
+#ifdef CL_DEVICE_PCI_SLOT_ID_NV
+    ADD_ATTR(DEVICE_, PCI_SLOT_ID_NV);
+#endif
+#endif
+// {{{ cl_amd_device_attribute_query
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+    ADD_ATTR(DEVICE_, PROFILING_TIMER_OFFSET_AMD);
+#endif
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+    ADD_ATTR(DEVICE_, TOPOLOGY_AMD);
+#endif
+#ifdef CL_DEVICE_BOARD_NAME_AMD
+    ADD_ATTR(DEVICE_, BOARD_NAME_AMD);
+#endif
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+    ADD_ATTR(DEVICE_, GLOBAL_FREE_MEMORY_AMD);
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+    ADD_ATTR(DEVICE_, SIMD_PER_COMPUTE_UNIT_AMD);
+#endif
+#ifdef CL_DEVICE_SIMD_WIDTH_AMD
+    ADD_ATTR(DEVICE_, SIMD_WIDTH_AMD);
+#endif
+#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+    ADD_ATTR(DEVICE_, SIMD_INSTRUCTION_WIDTH_AMD);
+#endif
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+    ADD_ATTR(DEVICE_, WAVEFRONT_WIDTH_AMD);
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+    ADD_ATTR(DEVICE_, GLOBAL_MEM_CHANNELS_AMD);
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+    ADD_ATTR(DEVICE_, GLOBAL_MEM_CHANNEL_BANKS_AMD);
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+    ADD_ATTR(DEVICE_, GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD);
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+    ADD_ATTR(DEVICE_, LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD);
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+    ADD_ATTR(DEVICE_, LOCAL_MEM_BANKS_AMD);
+#endif
+// }}}
+#ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD
+    ADD_ATTR(DEVICE_, THREAD_TRACE_SUPPORTED_AMD);
+#endif
+#ifdef CL_DEVICE_GFXIP_MAJOR_AMD
+    ADD_ATTR(DEVICE_, GFXIP_MAJOR_AMD);
+#endif
+#ifdef CL_DEVICE_GFXIP_MINOR_AMD
+    ADD_ATTR(DEVICE_, GFXIP_MINOR_AMD);
+#endif
+#ifdef CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD
+    ADD_ATTR(DEVICE_, AVAILABLE_ASYNC_QUEUES_AMD);
+#endif
+
+#ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT
+    ADD_ATTR(DEVICE_, MAX_ATOMIC_COUNTERS_EXT);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(DEVICE_, LINKER_AVAILABLE);
+    ADD_ATTR(DEVICE_, BUILT_IN_KERNELS);
+    ADD_ATTR(DEVICE_, IMAGE_MAX_BUFFER_SIZE);
+    ADD_ATTR(DEVICE_, IMAGE_MAX_ARRAY_SIZE);
+    ADD_ATTR(DEVICE_, PARENT_DEVICE);
+    ADD_ATTR(DEVICE_, PARTITION_MAX_SUB_DEVICES);
+    ADD_ATTR(DEVICE_, PARTITION_PROPERTIES);
+    ADD_ATTR(DEVICE_, PARTITION_AFFINITY_DOMAIN);
+    ADD_ATTR(DEVICE_, PARTITION_TYPE);
+    ADD_ATTR(DEVICE_, REFERENCE_COUNT);
+    ADD_ATTR(DEVICE_, PREFERRED_INTEROP_USER_SYNC);
+    ADD_ATTR(DEVICE_, PRINTF_BUFFER_SIZE);
+#endif
+#ifdef cl_khr_image2d_from_buffer
+    ADD_ATTR(DEVICE_, IMAGE_PITCH_ALIGNMENT);
+    ADD_ATTR(DEVICE_, IMAGE_BASE_ADDRESS_ALIGNMENT);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(DEVICE_, MAX_READ_WRITE_IMAGE_ARGS);
+    ADD_ATTR(DEVICE_, MAX_GLOBAL_VARIABLE_SIZE);
+    ADD_ATTR(DEVICE_, QUEUE_ON_DEVICE_PROPERTIES);
+    ADD_ATTR(DEVICE_, QUEUE_ON_DEVICE_PREFERRED_SIZE);
+    ADD_ATTR(DEVICE_, QUEUE_ON_DEVICE_MAX_SIZE);
+    ADD_ATTR(DEVICE_, MAX_ON_DEVICE_QUEUES);
+    ADD_ATTR(DEVICE_, MAX_ON_DEVICE_EVENTS);
+    ADD_ATTR(DEVICE_, SVM_CAPABILITIES);
+    ADD_ATTR(DEVICE_, GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE);
+    ADD_ATTR(DEVICE_, MAX_PIPE_ARGS);
+    ADD_ATTR(DEVICE_, PIPE_MAX_ACTIVE_RESERVATIONS);
+    ADD_ATTR(DEVICE_, PIPE_MAX_PACKET_SIZE);
+    ADD_ATTR(DEVICE_, PREFERRED_PLATFORM_ATOMIC_ALIGNMENT);
+    ADD_ATTR(DEVICE_, PREFERRED_GLOBAL_ATOMIC_ALIGNMENT);
+    ADD_ATTR(DEVICE_, PREFERRED_LOCAL_ATOMIC_ALIGNMENT);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2010
+    ADD_ATTR(DEVICE_, IL_VERSION);
+    ADD_ATTR(DEVICE_, MAX_NUM_SUB_GROUPS);
+    ADD_ATTR(DEVICE_, SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS);
+#endif
+    /* cl_intel_advanced_motion_estimation */
+#ifdef CL_DEVICE_ME_VERSION_INTEL
+    ADD_ATTR(DEVICE_, ME_VERSION_INTEL);
+#endif
+
+    /* cl_qcom_ext_host_ptr */
+#ifdef CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM
+    ADD_ATTR(DEVICE_, EXT_MEM_PADDING_IN_BYTES_QCOM);
+#endif
+#ifdef CL_DEVICE_PAGE_SIZE_QCOM
+    ADD_ATTR(DEVICE_, PAGE_SIZE_QCOM);
+#endif
+
+    /* cl_khr_spir */
+#ifdef CL_DEVICE_SPIR_VERSIONS
+    ADD_ATTR(DEVICE_, SPIR_VERSIONS);
+#endif
+
+    /* cl_altera_device_temperature */
+#ifdef CL_DEVICE_CORE_TEMPERATURE_ALTERA
+    ADD_ATTR(DEVICE_, CORE_TEMPERATURE_ALTERA);
+#endif
+
+    /* cl_intel_simultaneous_sharing */
+#ifdef CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL
+    ADD_ATTR(DEVICE_, SIMULTANEOUS_INTEROPS_INTEL);
+#endif
+#ifdef CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL
+    ADD_ATTR(DEVICE_, NUM_SIMULTANEOUS_INTEROPS_INTEL);
+#endif
+  }
+
+  {
+    py::class_<device_fp_config> cls(m, "device_fp_config");
+    ADD_ATTR(FP_, DENORM);
+    ADD_ATTR(FP_, INF_NAN);
+    ADD_ATTR(FP_, ROUND_TO_NEAREST);
+    ADD_ATTR(FP_, ROUND_TO_ZERO);
+    ADD_ATTR(FP_, ROUND_TO_INF);
+    ADD_ATTR(FP_, FMA);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(FP_, SOFT_FLOAT);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(FP_, CORRECTLY_ROUNDED_DIVIDE_SQRT);
+#endif
+  }
+
+  {
+    py::class_<device_mem_cache_type> cls(m, "device_mem_cache_type");
+    ADD_ATTR( , NONE);
+    ADD_ATTR( , READ_ONLY_CACHE);
+    ADD_ATTR( , READ_WRITE_CACHE);
+  }
+
+  {
+    py::class_<device_local_mem_type> cls(m, "device_local_mem_type");
+    ADD_ATTR( , LOCAL);
+    ADD_ATTR( , GLOBAL);
+  }
+
+  {
+    py::class_<device_exec_capabilities> cls(m, "device_exec_capabilities");
+    ADD_ATTR(EXEC_, KERNEL);
+    ADD_ATTR(EXEC_, NATIVE_KERNEL);
+#ifdef CL_EXEC_IMMEDIATE_EXECUTION_INTEL
+    ADD_ATTR(EXEC_, IMMEDIATE_EXECUTION_INTEL);
+#endif
+  }
+
+  {
+    py::class_<device_svm_capabilities> cls(m, "device_svm_capabilities");
+#if PYOPENCL_CL_VERSION >= 0x2000
+    // device_svm_capabilities
+    ADD_ATTR(DEVICE_SVM_, COARSE_GRAIN_BUFFER);
+    ADD_ATTR(DEVICE_SVM_, FINE_GRAIN_BUFFER);
+    ADD_ATTR(DEVICE_SVM_, FINE_GRAIN_SYSTEM);
+    ADD_ATTR(DEVICE_SVM_, ATOMICS);
+#endif
+  }
+
+  {
+    py::class_<command_queue_properties> cls(m, "command_queue_properties");
+    ADD_ATTR(QUEUE_, OUT_OF_ORDER_EXEC_MODE_ENABLE);
+    ADD_ATTR(QUEUE_, PROFILING_ENABLE);
+#ifdef CL_QUEUE_IMMEDIATE_EXECUTION_ENABLE_INTEL
+    ADD_ATTR(QUEUE_, IMMEDIATE_EXECUTION_ENABLE_INTEL);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(QUEUE_, ON_DEVICE);
+    ADD_ATTR(QUEUE_, ON_DEVICE_DEFAULT);
+#endif
+  }
+
+  {
+    py::class_<context_info> cls(m, "context_info");
+    ADD_ATTR(CONTEXT_, REFERENCE_COUNT);
+    ADD_ATTR(CONTEXT_, DEVICES);
+    ADD_ATTR(CONTEXT_, PROPERTIES);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(CONTEXT_, NUM_DEVICES);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(CONTEXT_, INTEROP_USER_SYNC);
+#endif
+  }
+
+  {
+    py::class_<gl_context_info> cls(m, "gl_context_info");
+#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
+    ADD_ATTR(, CURRENT_DEVICE_FOR_GL_CONTEXT_KHR);
+    ADD_ATTR(, DEVICES_FOR_GL_CONTEXT_KHR);
+#endif
+  }
+
+  {
+    py::class_<context_properties> cls(m, "context_properties");
+    ADD_ATTR(CONTEXT_, PLATFORM);
+#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
+    ADD_ATTR( ,GL_CONTEXT_KHR);
+    ADD_ATTR( ,EGL_DISPLAY_KHR);
+    ADD_ATTR( ,GLX_DISPLAY_KHR);
+    ADD_ATTR( ,WGL_HDC_KHR);
+    ADD_ATTR( ,CGL_SHAREGROUP_KHR);
+#endif
+#if defined(__APPLE__) && defined(HAVE_GL)
+    ADD_ATTR( ,CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE);
+#endif /* __APPLE__ */
+// cl_amd_offline_devices
+#ifdef CL_CONTEXT_OFFLINE_DEVICES_AMD
+    ADD_ATTR(CONTEXT_, OFFLINE_DEVICES_AMD);
+#endif
+  }
+
+  {
+    py::class_<command_queue_info> cls(m, "command_queue_info");
+    ADD_ATTR(QUEUE_, CONTEXT);
+    ADD_ATTR(QUEUE_, DEVICE);
+    ADD_ATTR(QUEUE_, REFERENCE_COUNT);
+    ADD_ATTR(QUEUE_, PROPERTIES);
+  }
+
+  {
+    py::class_<queue_properties> cls(m, "queue_properties");
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(QUEUE_, PROPERTIES);
+    ADD_ATTR(QUEUE_, SIZE);
+#endif
+  }
+
+  {
+    py::class_<mem_flags> cls(m, "mem_flags");
+    ADD_ATTR(MEM_, READ_WRITE);
+    ADD_ATTR(MEM_, WRITE_ONLY);
+    ADD_ATTR(MEM_, READ_ONLY);
+    ADD_ATTR(MEM_, USE_HOST_PTR);
+    ADD_ATTR(MEM_, ALLOC_HOST_PTR);
+    ADD_ATTR(MEM_, COPY_HOST_PTR);
+#ifdef cl_amd_device_memory_flags
+    ADD_ATTR(MEM_, USE_PERSISTENT_MEM_AMD);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(MEM_, HOST_WRITE_ONLY);
+    ADD_ATTR(MEM_, HOST_READ_ONLY);
+    ADD_ATTR(MEM_, HOST_NO_ACCESS);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(MEM_, KERNEL_READ_AND_WRITE);
+#endif
+  }
+
+  {
+    py::class_<svm_mem_flags> cls(m, "svm_mem_flags");
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(MEM_, READ_WRITE);
+    ADD_ATTR(MEM_, WRITE_ONLY);
+    ADD_ATTR(MEM_, READ_ONLY);
+    ADD_ATTR(MEM_, SVM_FINE_GRAIN_BUFFER);
+    ADD_ATTR(MEM_, SVM_ATOMICS);
+#endif
+  }
+
+  {
+    py::class_<channel_order> cls(m, "channel_order");
+    ADD_ATTR( , R);
+    ADD_ATTR( , A);
+    ADD_ATTR( , RG);
+    ADD_ATTR( , RA);
+    ADD_ATTR( , RGB);
+    ADD_ATTR( , RGBA);
+    ADD_ATTR( , BGRA);
+    ADD_ATTR( , INTENSITY);
+    ADD_ATTR( , LUMINANCE);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR( , Rx);
+    ADD_ATTR( , RGx);
+    ADD_ATTR( , RGBx);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(  , sRGB);
+    ADD_ATTR(  , sRGBx);
+    ADD_ATTR(  , sRGBA);
+    ADD_ATTR(  , sBGRA);
+    ADD_ATTR(  , ABGR);
+#endif
+  }
+
+  {
+    py::class_<channel_type> cls(m, "channel_type");
+    ADD_ATTR( , SNORM_INT8);
+    ADD_ATTR( , SNORM_INT16);
+    ADD_ATTR( , UNORM_INT8);
+    ADD_ATTR( , UNORM_INT16);
+    ADD_ATTR( , UNORM_SHORT_565);
+    ADD_ATTR( , UNORM_SHORT_555);
+    ADD_ATTR( , UNORM_INT_101010);
+    ADD_ATTR( , SIGNED_INT8);
+    ADD_ATTR( , SIGNED_INT16);
+    ADD_ATTR( , SIGNED_INT32);
+    ADD_ATTR( , UNSIGNED_INT8);
+    ADD_ATTR( , UNSIGNED_INT16);
+    ADD_ATTR( , UNSIGNED_INT32);
+    ADD_ATTR( , HALF_FLOAT);
+    ADD_ATTR( , FLOAT);
+  }
+
+  {
+    py::class_<mem_object_type> cls(m, "mem_object_type");
+    ADD_ATTR(MEM_OBJECT_, BUFFER);
+    ADD_ATTR(MEM_OBJECT_, IMAGE2D);
+    ADD_ATTR(MEM_OBJECT_, IMAGE3D);
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(MEM_OBJECT_, IMAGE2D_ARRAY);
+    ADD_ATTR(MEM_OBJECT_, IMAGE1D);
+    ADD_ATTR(MEM_OBJECT_, IMAGE1D_ARRAY);
+    ADD_ATTR(MEM_OBJECT_, IMAGE1D_BUFFER);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(MEM_OBJECT_, PIPE);
+#endif
+  }
+
+  {
+    py::class_<mem_info> cls(m, "mem_info");
+    ADD_ATTR(MEM_, TYPE);
+    ADD_ATTR(MEM_, FLAGS);
+    ADD_ATTR(MEM_, SIZE);
+    ADD_ATTR(MEM_, HOST_PTR);
+    ADD_ATTR(MEM_, MAP_COUNT);
+    ADD_ATTR(MEM_, REFERENCE_COUNT);
+    ADD_ATTR(MEM_, CONTEXT);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(MEM_, ASSOCIATED_MEMOBJECT);
+    ADD_ATTR(MEM_, OFFSET);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(MEM_, USES_SVM_POINTER);
+#endif
+  }
+
+  {
+    py::class_<image_info> cls(m, "image_info");
+    ADD_ATTR(IMAGE_, FORMAT);
+    ADD_ATTR(IMAGE_, ELEMENT_SIZE);
+    ADD_ATTR(IMAGE_, ROW_PITCH);
+    ADD_ATTR(IMAGE_, SLICE_PITCH);
+    ADD_ATTR(IMAGE_, WIDTH);
+    ADD_ATTR(IMAGE_, HEIGHT);
+    ADD_ATTR(IMAGE_, DEPTH);
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(IMAGE_, ARRAY_SIZE);
+    ADD_ATTR(IMAGE_, BUFFER);
+    ADD_ATTR(IMAGE_, NUM_MIP_LEVELS);
+    ADD_ATTR(IMAGE_, NUM_SAMPLES);
+#endif
+  }
+
+  {
+    py::class_<addressing_mode> cls(m, "addressing_mode");
+    ADD_ATTR(ADDRESS_, NONE);
+    ADD_ATTR(ADDRESS_, CLAMP_TO_EDGE);
+    ADD_ATTR(ADDRESS_, CLAMP);
+    ADD_ATTR(ADDRESS_, REPEAT);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(ADDRESS_, MIRRORED_REPEAT);
+#endif
+  }
+
+  {
+    py::class_<filter_mode> cls(m, "filter_mode");
+    ADD_ATTR(FILTER_, NEAREST);
+    ADD_ATTR(FILTER_, LINEAR);
+  }
+
+  {
+    py::class_<sampler_info> cls(m, "sampler_info");
+    ADD_ATTR(SAMPLER_, REFERENCE_COUNT);
+    ADD_ATTR(SAMPLER_, CONTEXT);
+    ADD_ATTR(SAMPLER_, NORMALIZED_COORDS);
+    ADD_ATTR(SAMPLER_, ADDRESSING_MODE);
+    ADD_ATTR(SAMPLER_, FILTER_MODE);
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(SAMPLER_, MIP_FILTER_MODE);
+    ADD_ATTR(SAMPLER_, LOD_MIN);
+    ADD_ATTR(SAMPLER_, LOD_MAX);
+#endif
+  }
+
+  {
+    py::class_<map_flags> cls(m, "map_flags");
+    ADD_ATTR(MAP_, READ);
+    ADD_ATTR(MAP_, WRITE);
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(MAP_, WRITE_INVALIDATE_REGION);
+#endif
+  }
+
+  {
+    py::class_<program_info> cls(m, "program_info");
+    ADD_ATTR(PROGRAM_, REFERENCE_COUNT);
+    ADD_ATTR(PROGRAM_, CONTEXT);
+    ADD_ATTR(PROGRAM_, NUM_DEVICES);
+    ADD_ATTR(PROGRAM_, DEVICES);
+    ADD_ATTR(PROGRAM_, SOURCE);
+    ADD_ATTR(PROGRAM_, BINARY_SIZES);
+    ADD_ATTR(PROGRAM_, BINARIES);
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(PROGRAM_, NUM_KERNELS);
+    ADD_ATTR(PROGRAM_, KERNEL_NAMES);
+#endif
+  }
+
+  {
+    py::class_<program_build_info> cls(m, "program_build_info");
+    ADD_ATTR(PROGRAM_BUILD_, STATUS);
+    ADD_ATTR(PROGRAM_BUILD_, OPTIONS);
+    ADD_ATTR(PROGRAM_BUILD_, LOG);
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(PROGRAM_, BINARY_TYPE);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(PROGRAM_BUILD_, GLOBAL_VARIABLE_TOTAL_SIZE);
+#endif
+  }
+
+  {
+    py::class_<program_binary_type> cls(m, "program_binary_type");
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(PROGRAM_BINARY_TYPE_, NONE);
+    ADD_ATTR(PROGRAM_BINARY_TYPE_, COMPILED_OBJECT);
+    ADD_ATTR(PROGRAM_BINARY_TYPE_, LIBRARY);
+    ADD_ATTR(PROGRAM_BINARY_TYPE_, EXECUTABLE);
+#endif
+  }
+
+  {
+    py::class_<kernel_info> cls(m, "kernel_info");
+    ADD_ATTR(KERNEL_, FUNCTION_NAME);
+    ADD_ATTR(KERNEL_, NUM_ARGS);
+    ADD_ATTR(KERNEL_, REFERENCE_COUNT);
+    ADD_ATTR(KERNEL_, CONTEXT);
+    ADD_ATTR(KERNEL_, PROGRAM);
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(KERNEL_, ATTRIBUTES);
+#endif
+  }
+
+  {
+    py::class_<kernel_arg_info> cls(m, "kernel_arg_info");
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(KERNEL_ARG_, ADDRESS_QUALIFIER);
+    ADD_ATTR(KERNEL_ARG_, ACCESS_QUALIFIER);
+    ADD_ATTR(KERNEL_ARG_, TYPE_NAME);
+    ADD_ATTR(KERNEL_ARG_, TYPE_QUALIFIER);
+    ADD_ATTR(KERNEL_ARG_, NAME);
+#endif
+  }
+
+  {
+    py::class_<kernel_arg_address_qualifier> cls(
+        m, "kernel_arg_address_qualifier");
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(KERNEL_ARG_ADDRESS_, GLOBAL);
+    ADD_ATTR(KERNEL_ARG_ADDRESS_, LOCAL);
+    ADD_ATTR(KERNEL_ARG_ADDRESS_, CONSTANT);
+    ADD_ATTR(KERNEL_ARG_ADDRESS_, PRIVATE);
+#endif
+  }
+
+  {
+    py::class_<kernel_arg_access_qualifier> cls(
+        m, "kernel_arg_access_qualifier");
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(KERNEL_ARG_ACCESS_, READ_ONLY);
+    ADD_ATTR(KERNEL_ARG_ACCESS_, WRITE_ONLY);
+    ADD_ATTR(KERNEL_ARG_ACCESS_, READ_WRITE);
+    ADD_ATTR(KERNEL_ARG_ACCESS_, NONE);
+#endif
+  }
+
+  {
+    py::class_<kernel_arg_type_qualifier> cls(
+        m, "kernel_arg_type_qualifier");
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(KERNEL_ARG_TYPE_, NONE);
+    ADD_ATTR(KERNEL_ARG_TYPE_, CONST);
+    ADD_ATTR(KERNEL_ARG_TYPE_, RESTRICT);
+    ADD_ATTR(KERNEL_ARG_TYPE_, VOLATILE);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(KERNEL_ARG_TYPE_, PIPE);
+#endif
+  }
+
+  {
+    py::class_<kernel_work_group_info> cls(m, "kernel_work_group_info");
+    ADD_ATTR(KERNEL_, WORK_GROUP_SIZE);
+    ADD_ATTR(KERNEL_, COMPILE_WORK_GROUP_SIZE);
+    ADD_ATTR(KERNEL_, LOCAL_MEM_SIZE);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(KERNEL_, PREFERRED_WORK_GROUP_SIZE_MULTIPLE);
+    ADD_ATTR(KERNEL_, PRIVATE_MEM_SIZE);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(KERNEL_, GLOBAL_WORK_SIZE);
+#endif
+  }
+
+  {
+    py::class_<event_info> cls(m, "event_info");
+    ADD_ATTR(EVENT_, COMMAND_QUEUE);
+    ADD_ATTR(EVENT_, COMMAND_TYPE);
+    ADD_ATTR(EVENT_, REFERENCE_COUNT);
+    ADD_ATTR(EVENT_, COMMAND_EXECUTION_STATUS);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(EVENT_, CONTEXT);
+#endif
+  }
+
+  {
+    py::class_<command_type> cls(m, "command_type");
+    ADD_ATTR(COMMAND_, NDRANGE_KERNEL);
+    ADD_ATTR(COMMAND_, TASK);
+    ADD_ATTR(COMMAND_, NATIVE_KERNEL);
+    ADD_ATTR(COMMAND_, READ_BUFFER);
+    ADD_ATTR(COMMAND_, WRITE_BUFFER);
+    ADD_ATTR(COMMAND_, COPY_BUFFER);
+    ADD_ATTR(COMMAND_, READ_IMAGE);
+    ADD_ATTR(COMMAND_, WRITE_IMAGE);
+    ADD_ATTR(COMMAND_, COPY_IMAGE);
+    ADD_ATTR(COMMAND_, COPY_IMAGE_TO_BUFFER);
+    ADD_ATTR(COMMAND_, COPY_BUFFER_TO_IMAGE);
+    ADD_ATTR(COMMAND_, MAP_BUFFER);
+    ADD_ATTR(COMMAND_, MAP_IMAGE);
+    ADD_ATTR(COMMAND_, UNMAP_MEM_OBJECT);
+    ADD_ATTR(COMMAND_, MARKER);
+    ADD_ATTR(COMMAND_, ACQUIRE_GL_OBJECTS);
+    ADD_ATTR(COMMAND_, RELEASE_GL_OBJECTS);
+#if PYOPENCL_CL_VERSION >= 0x1010
+    ADD_ATTR(COMMAND_, READ_BUFFER_RECT);
+    ADD_ATTR(COMMAND_, WRITE_BUFFER_RECT);
+    ADD_ATTR(COMMAND_, COPY_BUFFER_RECT);
+    ADD_ATTR(COMMAND_, USER);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(COMMAND_, BARRIER);
+    ADD_ATTR(COMMAND_, MIGRATE_MEM_OBJECTS);
+    ADD_ATTR(COMMAND_, FILL_BUFFER);
+    ADD_ATTR(COMMAND_, FILL_IMAGE);
+#endif
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(COMMAND_, SVM_FREE);
+    ADD_ATTR(COMMAND_, SVM_MEMCPY);
+    ADD_ATTR(COMMAND_, SVM_MEMFILL);
+    ADD_ATTR(COMMAND_, SVM_MAP);
+    ADD_ATTR(COMMAND_, SVM_UNMAP);
+#endif
+  }
+
+  {
+    py::class_<command_execution_status> cls(m, "command_execution_status");
+    ADD_ATTR(, COMPLETE);
+    ADD_ATTR(, RUNNING);
+    ADD_ATTR(, SUBMITTED);
+    ADD_ATTR(, QUEUED);
+  }
+
+  {
+    py::class_<profiling_info> cls(m, "profiling_info");
+    ADD_ATTR(PROFILING_COMMAND_, QUEUED);
+    ADD_ATTR(PROFILING_COMMAND_, SUBMIT);
+    ADD_ATTR(PROFILING_COMMAND_, START);
+    ADD_ATTR(PROFILING_COMMAND_, END);
+#if PYOPENCL_CL_VERSION >= 0x2000
+    ADD_ATTR(PROFILING_COMMAND_, COMPLETE);
+#endif
+  }
+
+/* not needed--filled in automatically by implementation.
+#if PYOPENCL_CL_VERSION >= 0x1010
+  {
+    py::class_<buffer_create_type> cls(m, "buffer_create_type");
+    ADD_ATTR(BUFFER_CREATE_TYPE_, REGION);
+  }
+#endif
+*/
+
+  {
+    py::class_<mem_migration_flags> cls(
+        m, "mem_migration_flags");
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(MIGRATE_MEM_OBJECT_, HOST);
+    ADD_ATTR(MIGRATE_MEM_OBJECT_, CONTENT_UNDEFINED);
+#endif
+  }
+
+  {
+    py::class_<device_partition_property> cls(
+        m, "device_partition_property");
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(DEVICE_PARTITION_, EQUALLY);
+    ADD_ATTR(DEVICE_PARTITION_, BY_COUNTS);
+    ADD_ATTR(DEVICE_PARTITION_, BY_COUNTS_LIST_END);
+    ADD_ATTR(DEVICE_PARTITION_, BY_AFFINITY_DOMAIN);
+#endif
+  }
+
+  {
+    py::class_<device_affinity_domain> cls(m, "device_affinity_domain");
+#if PYOPENCL_CL_VERSION >= 0x1020
+    ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, NUMA);
+    ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, L4_CACHE);
+    ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, L3_CACHE);
+    ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, L2_CACHE);
+    ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, L1_CACHE);
+    ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, NEXT_PARTITIONABLE);
+#endif
+  }
+
+#ifdef HAVE_GL
+  {
+    py::class_<gl_object_type> cls(m, "gl_object_type");
+    ADD_ATTR(GL_OBJECT_, BUFFER);
+    ADD_ATTR(GL_OBJECT_, TEXTURE2D);
+    ADD_ATTR(GL_OBJECT_, TEXTURE3D);
+    ADD_ATTR(GL_OBJECT_, RENDERBUFFER);
+  }
+
+  {
+    py::class_<gl_texture_info> cls(m, "gl_texture_info");
+    ADD_ATTR(GL_, TEXTURE_TARGET);
+    ADD_ATTR(GL_, MIPMAP_LEVEL);
+  }
+#endif
+
+  // }}}
+}
+
+
+
+
+// vim: foldmethod=marker
diff --git a/src/wrap_helpers.hpp b/src/wrap_helpers.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a2d1ee99e8fd044897e9791680ffc1a5c139222
--- /dev/null
+++ b/src/wrap_helpers.hpp
@@ -0,0 +1,163 @@
+#ifndef PYCUDA_WRAP_HELPERS_HEADER_SEEN
+#define PYCUDA_WRAP_HELPERS_HEADER_SEEN
+
+
+#include <pybind11/pybind11.h>
+#include <pybind11/operators.h>
+
+
+namespace py = pybind11;
+
+
+#define PYTHON_ERROR(TYPE, REASON) /* set PyExc_<TYPE> with message REASON, then unwind */ \
+{ \
+  PyErr_SetString(PyExc_##TYPE, REASON); \
+  throw py::error_already_set(); /* pybind11 picks up the error set above */ \
+}
+
+#define ENUM_VALUE(NAME) /* chainable .value("NAME", NAME) */ \
+  value(#NAME, NAME)
+
+#define DEF_SIMPLE_METHOD(NAME) /* bind cls::NAME under the name "NAME" */ \
+  def(#NAME, &cls::NAME)
+
+#define DEF_SIMPLE_STATIC_METHOD(NAME) /* same, registered via def_static */ \
+  def_static(#NAME, &cls::NAME)
+
+#define DEF_SIMPLE_METHOD_WITH_ARGS(NAME, ARGS) /* NOTE(review): py::args as in DEF_SIMPLE_FUNCTION_WITH_ARGS; confirm ARGS form */ \
+  def(#NAME, &cls::NAME, py::args ARGS)
+
+#define DEF_SIMPLE_FUNCTION(NAME) /* bind free function NAME on module 'm' */ \
+  m.def(#NAME, &NAME)
+
+#define DEF_SIMPLE_FUNCTION_WITH_ARGS(NAME, ARGS) /* same, with an argument spec appended */ \
+  m.def(#NAME, &NAME, py::args ARGS)
+
+#define DEF_SIMPLE_RO_MEMBER(NAME) /* read-only attribute "NAME" backed by cls::m_NAME */ \
+  def_readonly(#NAME, &cls::m_##NAME)
+
+#define DEF_SIMPLE_RW_MEMBER(NAME) /* read-write attribute "NAME" backed by cls::m_NAME */ \
+  def_readwrite(#NAME, &cls::m_##NAME)
+
+#define COPY_PY_LIST(TYPE, NAME) /* append every item of py_NAME, cast to TYPE, to NAME */ \
+  { \
+    for (auto it: py_##NAME) \
+      NAME.push_back(it.cast<TYPE>()); \
+  }
+
+#define COPY_PY_COORD_TRIPLE(NAME) \
+  size_t NAME[3] = {0, 0, 0}; \
+  { \
+    py::tuple py_tup_##NAME = py_##NAME; \
+    size_t my_len = len(py_tup_##NAME); \
+    if (my_len > 3) \
+      throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \
+    for (size_t i = 0; i < my_len; ++i) \
+      NAME[i] = py_tup_##NAME[i].cast<size_t>(); \
+  }
+
+#define COPY_PY_PITCH_TUPLE(NAME) \
+  size_t NAME[2] = {0, 0}; \
+  if (py_##NAME.ptr() != Py_None) \
+  { \
+    py::tuple py_tup_##NAME = py_##NAME; \
+    size_t my_len = len(py_tup_##NAME); \
+    if (my_len > 2) \
+      throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \
+    for (size_t i = 0; i < my_len; ++i) \
+      NAME[i] = py_tup_##NAME[i].cast<size_t>(); \
+  }
+
+#define COPY_PY_REGION_TRIPLE(NAME) \
+  size_t NAME[3] = {1, 1, 1}; \
+  { \
+    py::tuple py_tup_##NAME = py_##NAME; \
+    size_t my_len = len(py_tup_##NAME); \
+    if (my_len > 3) \
+      throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \
+    for (size_t i = 0; i < my_len; ++i) \
+      NAME[i] = py_tup_##NAME[i].cast<size_t>(); \
+  }
+
+#define PYOPENCL_PARSE_NUMPY_ARRAY_SPEC \
+    PyArray_Descr *tp_descr; \
+    if (PyArray_DescrConverter(dtype.ptr(), &tp_descr) != NPY_SUCCEED) \
+      throw py::error_already_set(); \
+    \
+    std::vector<npy_intp> shape; \
+    try \
+    { \
+      shape.push_back(py_shape.cast<npy_intp>()); \
+    } \
+    catch (py::cast_error &) \
+    { \
+      COPY_PY_LIST(npy_intp, shape); \
+    } \
+    \
+    NPY_ORDER order = PyArray_CORDER; \
+    PyArray_OrderConverter(py_order.ptr(), &order); \
+    \
+    int ary_flags = 0; \
+    if (order == PyArray_FORTRANORDER) \
+      ary_flags |= NPY_FARRAY; \
+    else if (order == PyArray_CORDER) \
+      ary_flags |= NPY_CARRAY; \
+    else \
+      throw std::runtime_error("unrecognized order specifier"); \
+    \
+    std::vector<npy_intp> strides; \
+    if (py_strides.ptr() != Py_None) \
+    { \
+      COPY_PY_LIST(npy_intp, strides); \
+    }
+
+#define PYOPENCL_RETURN_VECTOR(ITEMTYPE, NAME) /* return the items of NAME as a new py::list */ \
+  { \
+    py::list pyopencl_result; \
+    for (ITEMTYPE item: NAME) \
+      pyopencl_result.append(item); \
+    return pyopencl_result; \
+  }
+
+namespace
+{
+  // Wrap ptr in a py::object with the take_ownership policy.
+  template <typename T>
+  inline py::object handle_from_new_ptr(T *ptr)
+  { return py::cast(ptr, py::return_value_policy::take_ownership); }
+
+  // Reinterpret the integer obj_ref as a ClType and heap-allocate a T
+  // from it; 'retain' is forwarded to T's constructor unchanged.
+  template <typename T, typename ClType>
+  inline T *from_int_ptr(intptr_t obj_ref, bool retain)
+  {
+    ClType handle = (ClType) obj_ref;
+    return new T(handle, retain);
+  }
+
+  // Return obj.data() converted to an integer.
+  template <typename T>
+  inline intptr_t to_int_ptr(T const &obj)
+  { return (intptr_t) obj.data(); }
+}
+
+#define PYOPENCL_EXPOSE_TO_FROM_INT_PTR(CL_TYPENAME) /* adds from_int_ptr / int_ptr to a class_ chain */ \
+  .def_static("from_int_ptr", from_int_ptr<cls, CL_TYPENAME>, \
+      py::arg("int_ptr_value"), \
+      py::arg("retain")=true, \
+      "(static method) Return a new Python object referencing the C-level " \
+      ":c:type:`" #CL_TYPENAME "` object at the location pointed to " \
+      "by *int_ptr_value*. The relevant :c:func:`clRetain*` function " \
+      "will be called if *retain* is True. " \
+      "If the previous owner of the object will *not* release the reference, " \
+      "*retain* should be set to *False*, to effectively transfer ownership to " \
+      ":mod:`pyopencl`." \
+      "\n\n.. versionadded:: 2013.2\n" \
+      "\n\n.. versionchanged:: 2016.1\n\n    *retain* added.") \
+  .def_property_readonly("int_ptr", to_int_ptr<cls>, \
+      "Return an integer corresponding to the pointer value " \
+      "of the underlying :c:type:`" #CL_TYPENAME "`. " \
+      "Use :meth:`from_int_ptr` to turn back into a Python object." \
+      "\n\n.. versionadded:: 2013.2\n") /* macro ends here: no trailing continuation */
+
+#endif
diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a15efeb64c848c4e5e3928c98a5fb9186749e12b
--- /dev/null
+++ b/src/wrap_mempool.cpp
@@ -0,0 +1,288 @@
+// Gregor Thalhammer (on Apr 13, 2011) said it's necessary to import Python.h 
+// first to prevent OS X from overriding a bunch of macros. (e.g. isspace)
+#include <Python.h>
+
+#include <memory>
+#include <vector>
+#include "wrap_helpers.hpp"
+#include "wrap_cl.hpp"
+#include "mempool.hpp"
+#include "tools.hpp"
+
+
+
+namespace
+{
+  class cl_allocator_base  // abstract base of the cl_mem allocators defined below
+  {
+    protected:
+      std::shared_ptr<pyopencl::context> m_context;  // context given to the constructor
+      cl_mem_flags m_flags;  // flags given to the constructor
+
+    public:
+      cl_allocator_base(std::shared_ptr<pyopencl::context> const &ctx,
+          cl_mem_flags flags=CL_MEM_READ_WRITE)
+        : m_context(ctx), m_flags(flags)
+      {
+        // flags that involve a host pointer are rejected with CL_INVALID_VALUE
+        if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR))
+          throw pyopencl::error("Allocator", CL_INVALID_VALUE,
+              "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags");
+      }
+
+      cl_allocator_base(cl_allocator_base const &src)
+      : m_context(src.m_context), m_flags(src.m_flags)
+      { }
+
+      virtual ~cl_allocator_base()
+      { }
+
+      typedef cl_mem pointer_type;
+      typedef size_t size_type;
+      // Interface every concrete allocator has to provide:
+      virtual cl_allocator_base *copy() const = 0;
+      virtual bool is_deferred() const = 0;
+      virtual pointer_type allocate(size_type s) = 0;
+      // Release a handle through the guarded clReleaseMemObject call.
+      void free(pointer_type p)
+      {
+        PYOPENCL_CALL_GUARDED(clReleaseMemObject, (p));
+      }
+      // Runs pyopencl::run_python_gc(); allocator_call() uses it before retrying.
+      void try_release_blocks()
+      {
+        pyopencl::run_python_gc();
+      }
+  };
+
+  class cl_deferred_allocator : public cl_allocator_base
+  {
+    // Allocator whose allocate() only creates the buffer; see is_deferred().
+    public:
+      cl_deferred_allocator(std::shared_ptr<pyopencl::context> const &ctx,
+          cl_mem_flags flags=CL_MEM_READ_WRITE)
+        : cl_allocator_base(ctx, flags)
+      { }
+
+      // Polymorphic clone; the caller owns the returned object.
+      cl_allocator_base *copy() const
+      { return new cl_deferred_allocator(*this); }
+
+      // Always true for this allocator.
+      bool is_deferred() const
+      {
+        return true;
+      }
+
+      // Create a buffer of s bytes in m_context with m_flags; the final 0 is
+      // presumably the host pointer -- confirm against create_buffer.
+      pointer_type allocate(size_type s)
+      { return pyopencl::create_buffer(m_context->data(), m_flags, s, 0); }
+  };
+
+  const unsigned zero = 0;  // long-lived host source for allocate()'s write below
+  // Allocator that touches each new buffer with a small write inside allocate().
+  class cl_immediate_allocator : public cl_allocator_base
+  {
+    private:
+      typedef cl_allocator_base super;
+      pyopencl::command_queue m_queue;
+
+    public:
+      cl_immediate_allocator(pyopencl::command_queue &queue,
+          cl_mem_flags flags=CL_MEM_READ_WRITE)
+        : super(std::shared_ptr<pyopencl::context>(queue.get_context()), flags),
+        m_queue(queue.data(), /*retain*/ true)
+      { }
+
+      cl_immediate_allocator(cl_immediate_allocator const &src)
+        : super(src), m_queue(src.m_queue)
+      { }
+
+      cl_allocator_base *copy() const
+      { return new cl_immediate_allocator(*this); }
+
+      bool is_deferred() const
+      { return false; }
+
+      pointer_type allocate(size_type s)
+      {
+        pointer_type ptr =  pyopencl::create_buffer(
+            m_context->data(), m_flags, s, 0);
+
+        // Touch the buffer so it gets allocated right here and right now:
+        // memory pools rely on exact 'out-of-memory' information. The write
+        // is non-blocking, so it reads the namespace-scope 'zero', which
+        // outlives this call. No wait is needed: the enqueue itself faults
+        // the buffer onto the device (clWaitForEvents cannot report that).
+        try
+        {
+          PYOPENCL_CALL_GUARDED(clEnqueueWriteBuffer, (
+                m_queue.data(),
+                ptr,
+                /* is blocking */ CL_FALSE,
+                0, std::min(s, sizeof(zero)), &zero,
+                0, NULL, NULL
+                ));
+        }
+        catch (...)
+        {
+          clReleaseMemObject(ptr);  // do not leak the buffer when the write fails
+          throw;
+        }
+        return ptr;
+      }
+  };
+
+
+
+
+  inline
+  pyopencl::buffer *allocator_call(cl_allocator_base &alloc, size_t size)
+  {
+    // Allocate via 'alloc' and wrap the handle in a new pyopencl::buffer.
+    // Only an out-of-memory error is retried, once, after releasing blocks.
+    cl_mem mem;
+    for (int attempts_left = 2; ; )
+    {
+      try
+      {
+        mem = alloc.allocate(size);
+        break;
+      }
+      catch (pyopencl::error &e)
+      {
+        if (!e.is_out_of_memory() || --attempts_left == 0)
+          throw;
+      }
+
+      alloc.try_release_blocks();
+    }
+
+    // If building the wrapper throws, release the handle before rethrowing.
+    try
+    {
+      return new pyopencl::buffer(mem, false);
+    }
+    catch (...)
+    {
+      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
+      throw;
+    }
+  }
+
+
+
+
+  class pooled_buffer  // a pooled_allocation that is also a memory_object_holder
+    : public pyopencl::pooled_allocation<pyopencl::memory_pool<cl_allocator_base> >,
+    public pyopencl::memory_object_holder
+  {
+    private:
+      typedef
+        pyopencl::pooled_allocation<pyopencl::memory_pool<cl_allocator_base> >
+        super;
+
+    public:
+      pooled_buffer(
+          std::shared_ptr<super::pool_type> p, super::size_type s)
+        : super(p, s)
+      { }
+      // Forwards to ptr(), presumably inherited from pooled_allocation -- confirm.
+      const super::pointer_type data() const
+      { return ptr(); }
+  };
+
+
+
+
+  pooled_buffer *device_pool_allocate(
+      std::shared_ptr<pyopencl::memory_pool<cl_allocator_base> > pool,
+      pyopencl::memory_pool<cl_allocator_base>::size_type sz)
+  {
+    return new pooled_buffer(pool, sz);  // heap-allocated; ownership passes to the caller
+  }
+
+
+
+
+  template<class Wrapper>
+  void expose_memory_pool(Wrapper &wrapper)
+  {  // registers the members below on 'wrapper'; cls is the wrapped C++ type
+    typedef typename Wrapper::type cls;
+    wrapper
+      .def_property_readonly("held_blocks", &cls::held_blocks)
+      .def_property_readonly("active_blocks", &cls::active_blocks)
+      .DEF_SIMPLE_STATIC_METHOD(bin_number)
+      .DEF_SIMPLE_STATIC_METHOD(alloc_size)
+      .DEF_SIMPLE_METHOD(free_held)
+      .DEF_SIMPLE_METHOD(stop_holding)
+      ;
+  }
+}
+
+
+
+
+void pyopencl_expose_mempool(py::module &m)
+{  // registers bitlog2, the allocator classes, MemoryPool and PooledBuffer on module m
+  m.def("bitlog2", pyopencl::bitlog2);
+
+  {  // allocator base class: no constructor is exposed, only __call__
+    typedef cl_allocator_base cls;
+    py::class_<cls /*, boost::noncopyable */> wrapper(
+        m, "_tools_AllocatorBase"/*, py::no_init */);
+    wrapper
+      .def("__call__", allocator_call)
+      ;
+
+  }
+
+  {  // deferred allocator: constructible from a context, optionally with flags
+    typedef cl_deferred_allocator cls;
+    py::class_<cls, cl_allocator_base> wrapper(
+        m, "_tools_DeferredAllocator");
+    wrapper
+      .def(py::init<
+          std::shared_ptr<pyopencl::context> const &>())
+      .def(py::init<
+          std::shared_ptr<pyopencl::context> const &,
+          cl_mem_flags>())
+      ;
+  }
+
+  {  // immediate allocator: constructible from a command queue, optionally with flags
+    typedef cl_immediate_allocator cls;
+    py::class_<cls, cl_allocator_base> wrapper(
+        m, "_tools_ImmediateAllocator");
+    wrapper
+      .def(py::init<pyopencl::command_queue &>())
+      .def(py::init<pyopencl::command_queue &, cl_mem_flags>())
+      ;
+  }
+
+  {  // memory pool, held by std::shared_ptr; "allocate" and "__call__" are the same function
+    typedef pyopencl::memory_pool<cl_allocator_base> cls;
+
+    py::class_<
+      cls, /* boost::noncopyable, */
+      std::shared_ptr<cls>> wrapper( m, "MemoryPool");
+    wrapper
+      .def(py::init<cl_allocator_base const &>())
+      .def("allocate", device_pool_allocate)
+      .def("__call__", device_pool_allocate)
+      // undoc for now
+      .DEF_SIMPLE_METHOD(set_trace)
+      ;
+
+    expose_memory_pool(wrapper);
+  }
+
+  {  // pooled buffer: no constructor is exposed; "release" maps to cls::free
+    typedef pooled_buffer cls;
+    py::class_<cls, /* boost::noncopyable, */
+      pyopencl::memory_object_holder>(
+          m, "PooledBuffer"/* , py::no_init */)
+      .def("release", &cls::free)
+      ;
+  }
+}
diff --git a/test/test_array.py b/test/test_array.py
index 05008c169ae782a49b5b985c7a79780e337c5770..3e74bcf0e2bc3c5c56ebfbb971164d89fcc49a35 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -37,10 +37,11 @@ import pyopencl.tools as cl_tools
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 from pyopencl.characterize import has_double_support, has_struct_arg_count_bug
-from pyopencl.cffi_cl import _PYPY
 
 from pyopencl.clrandom import RanluxGenerator, PhiloxGenerator, ThreefryGenerator
 
+_PYPY = cl._PYPY
+
 
 # {{{ helpers
 
@@ -580,7 +581,7 @@ def test_bitwise(ctx_factory):
 
 @pytest.mark.parametrize("rng_class",
         [RanluxGenerator, PhiloxGenerator, ThreefryGenerator])
-@pytest.mark.parametrize("ary_size", [300, 301, 302, 303, 10007])
+@pytest.mark.parametrize("ary_size", [300, 301, 302, 303, 10007, 1000000])
 def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -605,16 +606,22 @@ def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False
             pt.hist(ran.get(), 30)
             pt.show()
 
-        assert (0 < ran.get()).all()
-        assert (ran.get() < 1).all()
+        assert (0 <= ran.get()).all()
+        assert (ran.get() <= 1).all()
 
         if rng_class is RanluxGenerator:
             gen.synchronize(queue)
 
         ran = cl_array.zeros(queue, ary_size, dtype)
         gen.fill_uniform(ran, a=4, b=7)
-        assert (4 < ran.get()).all()
-        assert (ran.get() < 7).all()
+        ran_host = ran.get()
+
+        for cond in [4 <= ran_host,  ran_host <= 7]:
+            good = cond.all()
+            if not good:
+                print(np.where(~cond))
+                print(ran_host[~cond])
+            assert good
 
         ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)
 
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index a17866fa77110f4d1b232898ca46c76f54ec4a83..4d729642163b0bebc63f2bd356a6e7ff00868ab7 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -295,7 +295,9 @@ def test_image_format_constructor():
 
     assert iform.channel_order == cl.channel_order.RGBA
     assert iform.channel_data_type == cl.channel_type.FLOAT
-    assert not iform.__dict__
+
+    if not cl._PYPY:
+        assert not hasattr(iform, "__dict__")
 
 
 def test_device_topology_amd_constructor():
@@ -306,7 +308,8 @@ def test_device_topology_amd_constructor():
     assert topol.device == 4
     assert topol.function == 5
 
-    assert not topol.__dict__
+    if not cl._PYPY:
+        assert not hasattr(topol, "__dict__")
 
 
 def test_nonempty_supported_image_formats(ctx_factory):
@@ -351,7 +354,7 @@ def test_that_python_args_fail(ctx_factory):
     prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))
 
     a_result = np.empty_like(a)
-    cl.enqueue_read_buffer(queue, a_buf, a_result).wait()
+    cl.enqueue_copy(queue, a_result, a_buf).wait()  # (queue, dest, src)
 
 
 def test_image_2d(ctx_factory):
@@ -513,8 +516,8 @@ def test_copy_buffer(ctx_factory):
     buf1 = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
     buf2 = cl.Buffer(context, mf.WRITE_ONLY, b.nbytes)
 
-    cl.enqueue_copy_buffer(queue, buf1, buf2).wait()
-    cl.enqueue_read_buffer(queue, buf2, b).wait()
+    cl.enqueue_copy(queue, buf2, buf1).wait()
+    cl.enqueue_copy(queue, b, buf2).wait()
 
     assert la.norm(a - b) == 0
 
@@ -569,7 +572,7 @@ def test_vector_args(ctx_factory):
 
     prg.set_vec(queue, dest.shape, None, x, dest_buf)
 
-    cl.enqueue_read_buffer(queue, dest_buf, dest).wait()
+    cl.enqueue_copy(queue, dest, dest_buf).wait()
 
     assert (dest == x).all()
 
@@ -665,36 +668,6 @@ def test_unload_compiler(platform):
     cl.unload_platform_compiler(platform)
 
 
-def test_enqueue_task(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-    mf = cl.mem_flags
-
-    prg = cl.Program(ctx, """
-    __kernel void
-    reverse(__global const float *in, __global float *out, int n)
-    {
-        for (int i = 0;i < n;i++) {
-            out[i] = in[n - 1 - i];
-        }
-    }
-    """).build()
-    knl = prg.reverse
-
-    n = 100
-    a = np.random.rand(n).astype(np.float32)
-    b = np.empty_like(a)
-
-    buf1 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
-    buf2 = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
-
-    knl.set_args(buf1, buf2, np.int32(n))
-    cl.enqueue_task(queue, knl)
-
-    cl.enqueue_copy(queue, b, buf2).wait()
-    assert la.norm(a[::-1] - b) == 0
-
-
 def test_platform_get_devices(ctx_factory):
     ctx = ctx_factory()
     platform = ctx.devices[0].platform
@@ -768,6 +741,10 @@ def test_user_event(ctx_factory):
 
 
 def test_buffer_get_host_array(ctx_factory):
+    if cl._PYPY:
+        # FIXME
+        pytest.xfail("Buffer.get_host_array not yet working on pypy")
+
     ctx = ctx_factory()
     mf = cl.mem_flags
 
@@ -823,7 +800,7 @@ def test_event_set_callback(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     if ctx._get_cl_version() < (1, 1):
-        pytest.skip("OpenCL 1.1 or newer required fro set_callback")
+        pytest.skip("OpenCL 1.1 or newer required for set_callback")
 
     a_np = np.random.rand(50000).astype(np.float32)
     b_np = np.random.rand(50000).astype(np.float32)
@@ -857,9 +834,17 @@ def test_event_set_callback(ctx_factory):
 
     queue.finish()
 
+    counter = 0
+
     # yuck
-    from time import sleep
-    sleep(0.1)
+    while not got_called:
+        from time import sleep
+        sleep(0.01)
+
+        # give up after about five seconds (500 polls of 10 ms each)
+        counter += 1
+        if counter >= 500:
+            break
 
     assert got_called
 
@@ -952,18 +937,10 @@ def test_coarse_grain_svm(ctx_factory):
 
     dev = ctx.devices[0]
 
-    has_svm = (ctx._get_cl_version() >= (2, 0) and
-                ctx.devices[0]._get_cl_version() >= (2, 0) and
-                cl.get_cl_header_version() >= (2, 0))
-
-    if dev.platform.name == "Portable Computing Language":
-        has_svm = (
-                get_pocl_version(dev.platform) >= (1, 0)
-                and cl.get_cl_header_version() >= (2, 0))
-
-    if not has_svm:
-        from pytest import skip
-        skip("SVM only available in OpenCL 2.0 and higher")
+    from pyopencl.characterize import has_coarse_grain_buffer_svm
+    from pytest import skip
+    if not has_coarse_grain_buffer_svm(queue.device):
+        skip("device does not support coarse-grain SVM")
 
     if ("AMD" in dev.platform.name
             and dev.type & cl.device_type.CPU):
@@ -1012,13 +989,9 @@ def test_fine_grain_svm(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    from pyopencl.characterize import has_fine_grain_buffer_svm
     from pytest import skip
-    if (ctx._get_cl_version() < (2, 0) or
-            cl.get_cl_header_version() < (2, 0)):
-        skip("SVM only available in OpenCL 2.0 and higher")
-
-    if not (ctx.devices[0].svm_capabilities
-            & cl.device_svm_capabilities.FINE_GRAIN_BUFFER):
+    if not has_fine_grain_buffer_svm(queue.device):
         skip("device does not support fine-grain SVM")
 
     n = 3000
@@ -1050,6 +1023,10 @@ def test_fine_grain_svm(ctx_factory):
     cl.cltypes.uint2,
     ])
 def test_map_dtype(ctx_factory, dtype):
+    if cl._PYPY:
+        # FIXME: drop this xfail once enqueue_map_buffer works on PyPy
+        pytest.xfail("enqueue_map_buffer not yet working on pypy")
+
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)