diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 85791809df3f97324a616f4a16de51fe330ea145..628443b955c9843f4246f88401a93a1c0ec955b9 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -23,9 +23,9 @@ THE SOFTWARE.
 """
 
 from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT  # noqa
-
 try:
-    import pyopencl._cl as _cl
+    import pyopencl.cffi_cl as _cl
+    #import pyopencl._cl as _cl
 except ImportError:
     import os
     from os.path import dirname, join, realpath
@@ -35,9 +35,13 @@ except ImportError:
                 "its source directory. This likely won't work.")
     raise
 
-
+# _ccl = _cl
+# import cffi_cl
+# _cl = cffi_cl
+    
 import numpy as np
-from pyopencl._cl import *  # noqa
+#from pyopencl._cl import *  # noqa
+from pyopencl.cffi_cl import *
 import inspect as _inspect
 
 CONSTANT_CLASSES = [
@@ -45,7 +49,6 @@ CONSTANT_CLASSES = [
         if _inspect.isclass(getattr(_cl, name))
         and name[0].islower()]
 
-
 class CompilerWarning(UserWarning):
     pass
 
@@ -235,29 +238,32 @@ def link_program(context, programs, options=[], devices=None):
 
 # }}}
 
-
 def _add_functionality():
     cls_to_info_cls = {
-            _cl.Platform:
-                (_cl.Platform.get_info, _cl.platform_info),
-            _cl.Device:
-                (_cl.Device.get_info, _cl.device_info),
-            _cl.Context:
-                (_cl.Context.get_info, _cl.context_info),
-            _cl.CommandQueue:
-                (_cl.CommandQueue.get_info, _cl.command_queue_info),
-            _cl.Event:
-                (_cl.Event.get_info, _cl.event_info),
-            _cl.MemoryObjectHolder:
-                (MemoryObjectHolder.get_info, _cl.mem_info),
+            # PlatformZ:
+            #     (PlatformZ.get_info, platform_info),
+            # DeviceZ:
+            #     (DeviceZ.get_info, device_info),
+            Platform:
+                (Platform.get_info, platform_info),
+            Device:
+                (Device.get_info, device_info),
+            Context:
+                (Context.get_info, context_info),
+            CommandQueue:
+                (CommandQueue.get_info, command_queue_info),
+            Event:
+                (Event.get_info, event_info),
+            MemoryObjectHolder:
+                (MemoryObjectHolder.get_info, mem_info),
             Image:
-                (_cl.Image.get_image_info, _cl.image_info),
+                (Image.get_image_info, image_info),
             Program:
-                (Program.get_info, _cl.program_info),
+                (Program.get_info, program_info),
             Kernel:
-                (Kernel.get_info, _cl.kernel_info),
-            _cl.Sampler:
-                (Sampler.get_info, _cl.sampler_info),
+                (Kernel.get_info, kernel_info),
+            Sampler:
+                (Sampler.get_info, sampler_info),
             }
 
     def to_string(cls, value, default_format=None):
@@ -286,16 +292,16 @@ def _add_functionality():
         for info_name, info_value in info_class.__dict__.iteritems():
             if info_name == "to_string" or info_name.startswith("_"):
                 continue
-
             setattr(cls, info_name.lower(), make_getinfo(
                     info_method, getattr(info_class, info_name)))
-
     # }}}
 
     # {{{ Platform
 
     def platform_repr(self):
-        return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.int_ptr)
+        return "<pyopencl.Platform '%s' at TODO>" % (self.name)
+        # TODO int_ptr
+        #return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.int_ptr)
 
     Platform.__repr__ = platform_repr
 
@@ -304,8 +310,11 @@ def _add_functionality():
     # {{{ Device
 
     def device_repr(self):
-        return "<pyopencl.Device '%s' on '%s' at 0x%x>" % (
-                self.name.strip(), self.platform.name.strip(), self.int_ptr)
+        return "<pyopencl.Device '%s' on TODO at TODO>" % (
+                self.name.strip())
+        # TODO
+        # return "<pyopencl.Device '%s' on '%s' at 0x%x>" % (
+        #         self.name.strip(), self.platform.name.strip(), self.int_ptr)
 
     Device.__repr__ = device_repr
 
diff --git a/pyopencl/cffi_cl.py b/pyopencl/cffi_cl.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ae52b588774f11e7365aa32fe720e0253cb50c
--- /dev/null
+++ b/pyopencl/cffi_cl.py
@@ -0,0 +1,493 @@
+
+from pyopencl._cl import device_info, context_info, command_queue_info, Event, event_info, mem_info, Image, image_info, program_info, Kernel, ImageFormat, GLBuffer, kernel_info, sampler_info, Sampler, have_gl, _enqueue_read_image, _enqueue_write_image, GLTexture, channel_type, _enqueue_copy_image, _enqueue_copy_image_to_buffer, _enqueue_copy_buffer_to_image, _enqueue_write_buffer, _enqueue_copy_buffer, get_cl_header_version, _enqueue_read_buffer_rect, _enqueue_write_buffer_rect, _enqueue_copy_buffer_rect, RuntimeError, program_kind, mem_object_type, Error, platform_info, device_type, mem_flags, LogicError
+
+import warnings
+
+import os.path
+current_directory = os.path.dirname(__file__)
+
+
+
+
+from cffi import FFI
+_ffi = FFI()
+_cl_header = """
+
+/* cl.h */
+/* scalar types */
+typedef int8_t		cl_char;
+typedef uint8_t		cl_uchar;
+typedef int16_t		cl_short;
+typedef uint16_t	cl_ushort;
+typedef int32_t		cl_int;
+typedef uint32_t	cl_uint;
+typedef int64_t		cl_long;
+typedef uint64_t	cl_ulong;
+
+typedef uint16_t        cl_half;
+typedef float                   cl_float;
+typedef double                  cl_double;
+
+
+typedef struct _cl_platform_id *    cl_platform_id;
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+typedef struct _cl_sampler *        cl_sampler;
+
+typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ 
+typedef cl_ulong            cl_bitfield;
+typedef cl_bitfield         cl_device_type;
+typedef cl_uint             cl_platform_info;
+typedef cl_uint             cl_device_info;
+typedef cl_bitfield         cl_device_fp_config;
+typedef cl_uint             cl_device_mem_cache_type;
+typedef cl_uint             cl_device_local_mem_type;
+typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_command_queue_properties;
+typedef intptr_t            cl_device_partition_property;
+typedef cl_bitfield         cl_device_affinity_domain;
+
+typedef intptr_t            cl_context_properties;
+typedef cl_uint             cl_context_info;
+typedef cl_uint             cl_command_queue_info;
+typedef cl_uint             cl_channel_order;
+typedef cl_uint             cl_channel_type;
+typedef cl_bitfield         cl_mem_flags;
+typedef cl_uint             cl_mem_object_type;
+typedef cl_uint             cl_mem_info;
+typedef cl_bitfield         cl_mem_migration_flags;
+typedef cl_uint             cl_image_info;
+typedef cl_uint             cl_buffer_create_type;
+typedef cl_uint             cl_addressing_mode;
+typedef cl_uint             cl_filter_mode;
+typedef cl_uint             cl_sampler_info;
+typedef cl_bitfield         cl_map_flags;
+typedef cl_uint             cl_program_info;
+typedef cl_uint             cl_program_build_info;
+typedef cl_uint             cl_program_binary_type;
+typedef cl_int              cl_build_status;
+typedef cl_uint             cl_kernel_info;
+typedef cl_uint             cl_kernel_arg_info;
+typedef cl_uint             cl_kernel_arg_address_qualifier;
+typedef cl_uint             cl_kernel_arg_access_qualifier;
+typedef cl_bitfield         cl_kernel_arg_type_qualifier;
+typedef cl_uint             cl_kernel_work_group_info;
+typedef cl_uint             cl_event_info;
+typedef cl_uint             cl_command_type;
+typedef cl_uint             cl_profiling_info;
+
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE  ...
+#define CL_MEM_WRITE_ONLY  ...
+#define CL_MEM_READ_ONLY  ...
+#define CL_MEM_USE_HOST_PTR  ...
+#define CL_MEM_ALLOC_HOST_PTR  ...
+#define CL_MEM_COPY_HOST_PTR  ...
+#define CL_MEM_HOST_WRITE_ONLY  ...
+#define CL_MEM_HOST_READ_ONLY  ...
+#define CL_MEM_HOST_NO_ACCESS  ...
+
+
+"""
+
+with open(os.path.join(current_directory, 'wrap_cl_core.h')) as _f:
+    _wrap_cl_header = _f.read()
+
+_ffi.cdef('%s\n%s' % (_cl_header, _wrap_cl_header))
+print current_directory
+_lib = _ffi.verify(
+    """
+    #include <wrap_cl.h>
+    """,
+    include_dirs=[os.path.join(current_directory, "../src/c_wrapper/")],
+    library_dirs=[current_directory],
+    libraries=["wrapcl", "OpenCL"])
+
+class PP(object):
+    def __init__(self, ptr):
+        self.ptr = ptr
+        self.size = _ffi.new('uint32_t *')
+
+    def __getitem__(self, key):
+        return self.ptr[0].__getitem__(key)
+
+    def __iter__(self):
+        for i in xrange(self.size[0]):
+            yield self[i]
+
+    def __del__(self):
+        _lib.freem(self.ptr[0])
+
+class CLRuntimeError(RuntimeError):
+    def __init__(self, routine, code, msg=""):
+        super(CLRuntimeError, self).__init__(msg)
+        self.routine = routine
+        self.code = code
+
+        
+# plats = _ffi.new("void**")
+# print _lib.get_platforms(plats)
+# print _ffi.cast('int**', plats)[0][1]
+# exit()
+
+
+
+# _platform_info_constants = {
+#     'PROFILE': _lib.CL_PLATFORM_PROFILE,
+#     'VERSION': _lib.CL_PLATFORM_VERSION,
+#     'NAME': _lib.CL_PLATFORM_NAME,
+#     'VENDOR': _lib.CL_PLATFORM_VENDOR,
+# }
+
+# # re-implementation
+# platform_info = type("platform_info", (NOINIT,), _platform_info_constants)
+
+# _device_type_constants = {
+#     'DEFAULT': _lib.CL_DEVICE_TYPE_DEFAULT,
+#     'CPU': _lib.CL_DEVICE_TYPE_CPU,
+#     'GPU': _lib.CL_DEVICE_TYPE_GPU,
+#     'ACCELERATOR': _lib.CL_DEVICE_TYPE_ACCELERATOR,
+#     'ALL': _lib.CL_DEVICE_TYPE_ALL,
+#     }
+# if _CL_VERSION >= (1, 2):
+#     _device_type_constants['CUSTOM'] = _lib.CL_DEVICE_TYPE_CUSTOM
+
+# # re-implementation
+# device_type = type("device_type", (NOINIT,), _device_type_constants)
+
+# _mem_flags_constants = {
+#     'READ_WRITE': _lib.CL_MEM_READ_WRITE, 
+#     'WRITE_ONLY': _lib.CL_MEM_WRITE_ONLY, 
+#     'READ_ONLY': _lib.CL_MEM_READ_ONLY, 
+#     'USE_HOST_PTR': _lib.CL_MEM_USE_HOST_PTR, 
+#     'ALLOC_HOST_PTR': _lib.CL_MEM_ALLOC_HOST_PTR, 
+#     'COPY_HOST_PTR': _lib.CL_MEM_COPY_HOST_PTR,
+#     # TODO_PLAT
+#     # #ifdef cl_amd_device_memory_flags
+#     #     'USE_PERSISTENT_MEM_AMD': _lib.CL_MEM_USE_PERSISTENT_MEM_AMD, 
+#     # #endif
+#     }
+# if _CL_VERSION >= (1, 2):
+#     _mem_flags_constants.update({
+#         'HOST_WRITE_ONLY': _lib.CL_MEM_HOST_WRITE_ONLY, 
+#         'HOST_READ_ONLY': _lib.CL_MEM_HOST_READ_ONLY, 
+#         'HOST_NO_ACCESS': _lib.CL_MEM_HOST_NO_ACCESS,
+#     })
+
+# # re-implementation
+# mem_flags = type("mem_flags", (NOINIT,), _mem_flags_constants)
+
+class EQUALITY_TESTS(object):
+    def __eq__(self, other):
+        return hash(self) == hash(other)
+
+class Device(EQUALITY_TESTS):
+    def __init__(self):
+        pass
+
+    def __hash__(self):
+        return _lib.device__hash(self.ptr)
+
+    # todo: __del__
+
+    def get_info(self, param):
+        if param == 4145:
+            return self.__dict__["platform"] # TODO HACK
+        value = _ffi.new('char **')
+        _lib.device__get_info(self.ptr, param, value)
+        return _ffi.string(value[0])
+
+def _create_device(ptr):
+    device = Device()
+    device.ptr = ptr
+    return device
+
+def _parse_context_properties(properties):
+    props = []
+    if properties is None:
+        return _ffi.NULL
+
+    for prop_tuple in properties:
+        if len(prop_tuple) != 2:
+            raise CLRuntimeError("Context", _lib.CL_INVALID_VALUE, "property tuple must have length 2")
+        prop, value = prop_tuple
+        props.append(prop)
+        if prop == _lib.CL_CONTEXT_PLATFORM:
+            props.append(_ffi.cast('cl_context_properties', value.data()))
+            
+        else: # TODO_PLAT CL_WGL_HDC_KHR and morecc
+            raise CLRuntimeError("Context", _lib.CL_INVALID_VALUE, "invalid context property")
+    props.append(0)
+    return _ffi.new('cl_context_properties[]', props)
+
+        
+class Context(object):
+    def __init__(self, devices=None, properties=None, dev_type=None):
+        c_props = _parse_context_properties(properties)
+        status_code = _ffi.new('cl_int *')
+        
+        # from device list
+        if devices is not None:
+            if dev_type is not None:
+                raise CLRuntimeError("Context", _lib.CL_INVALID_VALUE, "one of 'devices' or 'dev_type' must be None")
+            ptr_devices = _ffi.new('cl_device_id[]', [device.ptr for device in devices])
+            ptr_ctx = _ffi.new('void **')
+            _lib._create_context(ptr_ctx, c_props, len(ptr_devices), _ffi.cast('void**', ptr_devices))
+            
+        else: # from dev_type
+            raise NotImplementedError()
+
+        self.ptr = ptr_ctx[0]
+
+    def get_info(self, param):
+        return 'TODO'
+
+class CommandQueue(object):
+    def __init__(self, context, device=None, properties=None):
+        if properties is None:
+            properties = 0
+        ptr_command_queue = _ffi.new('void **')
+        _lib._create_command_queue(ptr_command_queue, context.ptr, _ffi.NULL if device is None else device.ptr, properties)
+        self.ptr = ptr_command_queue[0]
+
+    def get_info(self, param):
+        print param
+        raise NotImplementedError()
+
+class MemoryObjectHolder(object):
+    def get_info(self, param):
+        info = _ffi.new('generic_info *')
+        _lib.memory_object_holder__get_info(self.ptr, param, info)
+        return _generic_info_to_python(info)
+
+class MemoryObject(MemoryObjectHolder):
+    pass
+        
+class Buffer(MemoryObjectHolder):
+    def __init__(self, context, flags, size=0, hostbuf=None):
+        if hostbuf is not None and not (flags & (_lib.CL_MEM_USE_HOST_PTR | _lib.CL_MEM_COPY_HOST_PTR)):
+            warnings.warn("'hostbuf' was passed, but no memory flags to make use of it.")
+        c_hostbuf = _ffi.NULL
+        if hostbuf is not None:
+            # todo: buffer protocol; for now hostbuf is assumed to be a numpy array
+            c_hostbuf = _ffi.cast('void *', hostbuf.ctypes.data)
+            hostbuf_size = hostbuf.nbytes
+            if size > hostbuf_size:
+                raise CLRuntimeError("Buffer", _lib.CL_INVALID_VALUE, "specified size is greater than host buffer size")
+            if size == 0:
+                size = hostbuf_size
+
+        ptr_buffer = _ffi.new('void **')
+        _lib._create_buffer(ptr_buffer, context.ptr, flags, size, c_hostbuf)
+        self.ptr = ptr_buffer[0]
+
+class _Program(object):
+    def __init__(self, *args):
+        if len(args) == 2:
+            self._init_source(*args)
+        else:
+            self._init_binary(*args)
+
+    def int_ptr(self):
+        raise NotImplementedError()
+
+    def from_int_ptr(self, int_ptr_value):
+        raise NotImplementedError()
+            
+    def _init_source(self, context, src):
+        ptr_program = _ffi.new('void **')
+        _lib._create_program_with_source(ptr_program, context.ptr, _ffi.new('char[]', src))
+        self.ptr = ptr_program[0]
+
+    def _init_binary(self, context, devices, binaries):
+        if len(devices) != len(binaries):
+            raise CLRuntimeError("create_program_with_binary", _lib.CL_INVALID_VALUE, "device and binary counts don't match")
+            
+        ptr_program = _ffi.new('void **')
+        ptr_devices = _ffi.new('void*[]', [device.ptr for device in devices])
+        ptr_binaries = _ffi.new('char*[]', len(binaries))
+        for i, binary in enumerate(binaries):
+            ptr_binaries[i] = _ffi.new('char[]', binary)
+        _lib._create_program_with_binary(ptr_program, context.ptr, len(ptr_devices), ptr_devices, len(ptr_binaries), ptr_binaries)
+        self.ptr = ptr_program[0]
+
+    def kind(self):
+        kind = _ffi.new('int *')
+        _lib.program__kind(self.ptr, kind)
+        return kind[0]
+
+    def _build(self, options=None, devices=None):
+        if devices is None: raise NotImplementedError()
+        # TODO: if devices is None, create them
+        if options is None:
+            options = ""
+        ptr_devices = _ffi.new('void*[]', [device.ptr for device in devices])
+        _lib.program__build(self.ptr, _ffi.new('char[]', options), len(ptr_devices), _ffi.cast('void**', ptr_devices))
+
+    def get_info(self, param):
+        if param == program_info.DEVICES:
+            # todo: refactor, same code as in get_devices 
+            devices = PP(_ffi.new('void**'))
+            _lib.program__get_info__devices(self.ptr, devices.ptr, devices.size)
+            result = []
+            for i in xrange(devices.size[0]):
+                # TODO why is the cast needed? 
+                device_ptr = _ffi.cast('void**', devices.ptr[0])[i]
+                result.append(_create_device(device_ptr))
+            return result
+        elif param == program_info.BINARIES:
+            # TODO possible memory leak? the char arrays might not be freed
+            ptr_binaries = PP(_ffi.new('char***'))
+            _lib.program__get_info__binaries(self.ptr, ptr_binaries.ptr, ptr_binaries.size)
+            return map(_ffi.string, ptr_binaries)
+        print param
+        raise NotImplementedError()
+        
+class Platform(object):
+    def __init__(self):
+        pass
+
+    # todo: __del__
+
+    def get_info(self, param):
+        value = _ffi.new('char **')
+        _lib.platform__get_info(self.ptr, param, value)
+        return _ffi.string(value[0])
+    
+    def get_devices(self, device_type=device_type.ALL):
+        devices = PP(_ffi.new('void**'))
+        _lib.platform__get_devices(self.ptr, devices.ptr, devices.size, device_type)
+        result = []
+        for i in xrange(devices.size[0]):
+            # TODO why is the cast needed? 
+            device_ptr = _ffi.cast('void**', devices.ptr[0])[i]
+            result.append(_create_device(device_ptr))
+        # TODO remove, should be done via get_info(PLATFORM)
+        for r in result:
+            r.__dict__["platform"] = self
+        return result
+
+def _create_platform(ptr):
+    platform = Platform()
+    platform.ptr = ptr
+    return platform
+
+def _generic_info_to_python(info):
+    for type_ in ('cl_uint',
+                  'cl_mem_object_type',
+                  ):
+        if info.type == getattr(_lib, 'generic_info_type_%s' % type_):
+            return getattr(info.value, '_%s' % type_)
+    raise NotImplementedError(info.type)
+
+class Kernel(object):
+    def __init__(self, program, name):
+        ptr_kernel = _ffi.new('void **')
+        _lib._create_kernel(ptr_kernel, program.ptr, name)
+        self.ptr = ptr_kernel[0]
+
+    def get_info(self, param):
+        info = _ffi.new('generic_info *')
+        _lib.kernel__get_info(self.ptr, param, info)
+        return _generic_info_to_python(info)
+        #raise NotImplementedError()
+
+    def set_arg(self, arg_index, arg):
+        if isinstance(arg, Buffer):
+            _lib.kernel__set_arg_mem_buffer(self.ptr, arg_index, arg.ptr)
+        else:
+            raise NotImplementedError()
+    
+def get_platforms():
+    platforms = PP(_ffi.new('void**'))
+    _lib.get_platforms(platforms.ptr, platforms.size)
+    result = []
+    for i in xrange(platforms.size[0]):
+        # TODO why is the cast needed? 
+        platform_ptr = _ffi.cast('void**', platforms.ptr[0])[i]
+        result.append(_create_platform(platform_ptr))
+        
+    return result
+
+class Event(object):
+    def __init__(self):
+        pass
+
+    def get_info(self, param):
+        print param
+        raise NotImplementedError()
+    
+def _create_event(ptr):
+    event = Event()
+    event.ptr = ptr
+    return event
+
+
+def enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False):
+    if wait_for is not None:
+        raise NotImplementedError("wait_for")
+    work_dim = len(global_work_size)
+    
+    if local_work_size is not None:
+        if g_times_l:
+            work_dim = max(work_dim, len(local_work_size))
+        elif work_dim != len(local_work_size):
+            raise CLRuntimeError("enqueue_nd_range_kernel", _lib.CL_INVALID_VALUE,
+                                 "global/local work sizes have differing dimensions")
+        
+        local_work_size = list(local_work_size)
+        
+        if len(local_work_size) < work_dim:
+            local_work_size.extend([1] * (work_dim - len(local_work_size)))
+        if len(global_work_size) < work_dim:
+            global_work_size.extend([1] * (work_dim - len(global_work_size)))
+            
+    elif g_times_l:
+        for i in xrange(work_dim):
+            global_work_size[i] *= local_work_size[i]
+
+    if global_work_offset is not None:
+        raise NotImplementedError("global_work_offset")
+
+    c_global_work_offset = _ffi.NULL
+    c_global_work_size = _ffi.new('const size_t[]', global_work_size)
+    if local_work_size is None:
+        c_local_work_size = _ffi.NULL
+    else:
+        c_local_work_size = _ffi.new('const size_t[]', local_work_size)
+
+    ptr_event = _ffi.new('void **')
+    _lib._enqueue_nd_range_kernel(
+        ptr_event,
+        queue.ptr,
+        kernel.ptr,
+        work_dim,
+        c_global_work_offset,
+        c_global_work_size,
+        c_local_work_size
+    )
+    return _create_event(ptr_event[0])
+
+def _enqueue_read_buffer(cq, mem, buf, device_offset=0, is_blocking=True):
+    # assume numpy
+    c_buf = _ffi.cast('void *', buf.ctypes.data)
+    size = buf.nbytes
+    ptr_event = _ffi.new('void **')
+    _lib._enqueue_read_buffer(
+        ptr_event,
+        cq.ptr,
+        mem.ptr,
+        c_buf,
+        size,
+        device_offset,
+        bool(is_blocking)
+    )
+    return _create_event(ptr_event[0])
+
+
+
diff --git a/src/c_wrapper/Makefile b/src/c_wrapper/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7114b1a32b682e2c1259e711f96405c98cada9ce
--- /dev/null
+++ b/src/c_wrapper/Makefile
@@ -0,0 +1,5 @@
+all:
+	g++ -c -Wall -DPYOPENCL_PRETEND_CL_VERSION=4112 -Werror -fpic wrap_cl.c
+	g++ -shared -o libwrapcl.so wrap_cl.o
+	cp libwrapcl.so ../../pyopencl/
+	cp wrap_cl_core.h ../../pyopencl/wrap_cl_core.h
diff --git a/src/c_wrapper/wrap_cl.c b/src/c_wrapper/wrap_cl.c
new file mode 100644
index 0000000000000000000000000000000000000000..e473f21cd853879274f38296a83ceb82b0e53fe2
--- /dev/null
+++ b/src/c_wrapper/wrap_cl.c
@@ -0,0 +1,2156 @@
+#include "wrap_cl.h"
+#include <stdlib.h>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+#include <string.h>
+#include <memory>
+
+#define MALLOC(TYPE, VAR, N) TYPE* VAR = reinterpret_cast<TYPE*>(malloc(sizeof(TYPE)*(N)));
+
+// {{{ tracing and error reporting
+#ifdef PYOPENCL_TRACE
+#define PYOPENCL_PRINT_CALL_TRACE(NAME)		\
+  std::cerr << NAME << std::endl;
+#define PYOPENCL_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO)	\
+  std::cerr << NAME << " (" << EXTRA_INFO << ')' << std::endl;
+#else
+#define PYOPENCL_PRINT_CALL_TRACE(NAME) /*nothing*/
+#define PYOPENCL_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) /*nothing*/
+#endif
+
+// TODO Py_BEGIN_ALLOW_THREADS \ Py_END_ALLOW_THREADS below
+#define PYOPENCL_CALL_GUARDED_THREADED(NAME, ARGLIST)	\
+  {							\
+    PYOPENCL_PRINT_CALL_TRACE(#NAME);			\
+    cl_int status_code;					\
+    status_code = NAME ARGLIST;				\
+    if (status_code != CL_SUCCESS)			\
+      throw pyopencl::error(#NAME, status_code);	\
+  }
+
+
+#define PYOPENCL_CALL_GUARDED(NAME, ARGLIST)		\
+  {							\
+    PYOPENCL_PRINT_CALL_TRACE(#NAME);			\
+    cl_int status_code;					\
+    status_code = NAME ARGLIST;				\
+    if (status_code != CL_SUCCESS)			\
+      throw pyopencl::error(#NAME, status_code);	\
+  }
+
+// }}}
+
+#define PYOPENCL_GET_VEC_INFO(WHAT, FIRST_ARG, SECOND_ARG, RES_VEC) \
+  { \
+    size_t size; \
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
+        (FIRST_ARG, SECOND_ARG, 0, 0, &size)); \
+    \
+    RES_VEC.resize(size / sizeof(RES_VEC.front())); \
+    \
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
+        (FIRST_ARG, SECOND_ARG, size, \
+         RES_VEC.empty( ) ? NULL : &RES_VEC.front(), &size)); \
+  }
+
+#define PYOPENCL_WAITLIST_ARGS \
+    num_events_in_wait_list, event_wait_list.empty( ) ? NULL : &event_wait_list.front()
+
+
+#define PYOPENCL_GET_STR_INFO(WHAT, FIRST_ARG, SECOND_ARG)		\
+  {									\
+    size_t param_value_size;						\
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info,				\
+			  (FIRST_ARG, SECOND_ARG, 0, 0, &param_value_size)); \
+									\
+    std::vector<char> param_value(param_value_size);			\
+    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info,				\
+			  (FIRST_ARG, SECOND_ARG, param_value_size,	\
+			   param_value.empty( ) ? NULL : &param_value.front(), &param_value_size)); \
+									\
+    return param_value.empty( ) ? "" : std::string(&param_value.front(), param_value_size-1); \
+  }
+
+#define PYOPENCL_GET_INTEGRAL_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE)	\
+  {									\
+  TYPE param_value;							\
+  PYOPENCL_CALL_GUARDED(clGet##WHAT##Info,				\
+			(FIRST_ARG, SECOND_ARG, sizeof(param_value), &param_value, 0)); \
+  generic_info info;							\
+  info.type = generic_info_type_##TYPE;					\
+  info.value._##TYPE = param_value;					\
+  return info;								\
+}
+
+
+#define PYOPENCL_RETURN_NEW_EVENT(evt) \
+    try \
+    { \
+      return new event(evt, false); \
+    } \
+    catch (...) \
+    { \
+      clReleaseEvent(evt); \
+      throw; \
+    }
+
+
+// {{{ equality testing
+#define PYOPENCL_EQUALITY_TESTS(cls) \
+    bool operator==(cls const &other) const \
+    { return data() == other.data(); } \
+    bool operator!=(cls const &other) const \
+    { return data() != other.data(); } \
+    long hash() const \
+    { return (long) (intptr_t) data(); }
+// }}}
+
+
+
+// {{{ tools
+#define PYOPENCL_CAST_BOOL(B) ((B) ? CL_TRUE : CL_FALSE)
+
+#define PYOPENCL_PARSE_PY_DEVICES \
+    std::vector<cl_device_id> devices_vec; \
+    cl_uint num_devices; \
+    cl_device_id *devices; \
+    \
+    if (py_devices.ptr() == Py_None) \
+    { \
+      num_devices = 0; \
+      devices = 0; \
+    } \
+    else \
+    { \
+      PYTHON_FOREACH(py_dev, py_devices) \
+        devices_vec.push_back( \
+            py::extract<device &>(py_dev)().data()); \
+      num_devices = devices_vec.size(); \
+      devices = devices_vec.empty( ) ? NULL : &devices_vec.front(); \
+    } \
+
+
+
+#define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \
+  { \
+    bool failed_with_mem_error = false; \
+    try \
+    { \
+      OPERATION \
+    } \
+    catch (pyopencl::error &e) \
+    { \
+      failed_with_mem_error = true; \
+      if (!e.is_out_of_memory()) \
+        throw; \
+    } \
+    \
+    if (failed_with_mem_error) \
+    { \
+      /* If we get here, we got an error from CL.
+       * We should run the Python GC to try and free up
+       * some memory references. */ \
+      run_python_gc(); \
+      \
+      /* Now retry the allocation. If it fails again,
+       * let it fail. */ \
+      { \
+        OPERATION \
+      } \
+    } \
+  }
+
+// }}}
+
+
+int get_cl_version(void) {
+  return PYOPENCL_CL_VERSION;
+}
+
+extern "C"
+namespace pyopencl
+{
+  char* _copy_str(const std::string& str) {
+    MALLOC(char, cstr, str.size() + 1);
+    strcpy(cstr, str.c_str());
+    return cstr;
+  }
+  
+  // {{{ error
+  class error : public std::runtime_error
+  {
+  private:
+    const char *m_routine;
+    cl_int m_code;
+
+  public:
+    error(const char *rout, cl_int c, const char *msg="")
+      : std::runtime_error(msg), m_routine(rout), m_code(c)
+    { std::cout << rout <<";" << msg<< ";" << c << std::endl; }
+
+    const char *routine() const
+    {
+      return m_routine;
+    }
+
+    cl_int code() const
+    {
+      return m_code;
+    }
+
+    bool is_out_of_memory() const
+    {
+      return (code() == CL_MEM_OBJECT_ALLOCATION_FAILURE
+	      || code() == CL_OUT_OF_RESOURCES
+	      || code() == CL_OUT_OF_HOST_MEMORY);
+    }
+
+  };
+
+  // }}}
+
+
+  //#define MAKE_INFO(name, type, value) {  }
+
+  
+  // {{{ event/synchronization
+  class event // : boost::noncopyable
+  {
+    private:
+      cl_event m_event;
+
+    public:
+      event(cl_event event, bool retain)
+        : m_event(event)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainEvent, (event));
+      }
+
+      event(event const &src)
+        : m_event(src.m_event)
+      { PYOPENCL_CALL_GUARDED(clRetainEvent, (m_event)); }
+
+      virtual ~event()
+      {
+	// todo
+        // PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseEvent,
+        //     (m_event));
+      }
+
+      const cl_event data() const
+      { return m_event; }
+
+      PYOPENCL_EQUALITY_TESTS(event);
+
+      // py::object get_info(cl_event_info param_name) const
+//       {
+//         switch (param_name)
+//         {
+//           case CL_EVENT_COMMAND_QUEUE:
+//             PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name,
+//                 cl_command_queue, command_queue);
+//           case CL_EVENT_COMMAND_TYPE:
+//             PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+//                 cl_command_type);
+//           case CL_EVENT_COMMAND_EXECUTION_STATUS:
+//             PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+//                 cl_int);
+//           case CL_EVENT_REFERENCE_COUNT:
+//             PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+//                 cl_uint);
+// #if PYOPENCL_CL_VERSION >= 0x1010
+//           case CL_EVENT_CONTEXT:
+//             PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name,
+//                 cl_context, context);
+// #endif
+
+//           default:
+//             throw error("Event.get_info", CL_INVALID_VALUE);
+//         }
+//       }
+
+      // py::object get_profiling_info(cl_profiling_info param_name) const
+      // {
+      //   switch (param_name)
+      //   {
+      //     case CL_PROFILING_COMMAND_QUEUED:
+      //     case CL_PROFILING_COMMAND_SUBMIT:
+      //     case CL_PROFILING_COMMAND_START:
+      //     case CL_PROFILING_COMMAND_END:
+      //       PYOPENCL_GET_INTEGRAL_INFO(EventProfiling, m_event, param_name,
+      //           cl_ulong);
+      //     default:
+      //       throw error("Event.get_profiling_info", CL_INVALID_VALUE);
+      //   }
+      // }
+
+      virtual void wait()
+      {
+        PYOPENCL_CALL_GUARDED_THREADED(clWaitForEvents, (1, &m_event));
+      }
+  };
+
+      // }}}
+
+
+
+  // {{{ platform
+  class platform
+  {
+  private:
+    cl_platform_id m_platform;
+
+  public:
+    platform(cl_platform_id pid)
+      : m_platform(pid)
+    { }
+
+    platform(cl_platform_id pid, bool /*retain (ignored)*/)
+      : m_platform(pid)
+    { }
+
+    cl_platform_id data() const
+    {
+      return m_platform;
+    }
+
+    // TODO
+    // PYOPENCL_EQUALITY_TESTS(platform);
+
+    std::string get_info(cl_platform_info param_name) const
+    {
+      switch (param_name)
+	{
+	case CL_PLATFORM_PROFILE:
+	case CL_PLATFORM_VERSION:
+	case CL_PLATFORM_NAME:
+	case CL_PLATFORM_VENDOR:
+#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
+	case CL_PLATFORM_EXTENSIONS:
+#endif
+	  PYOPENCL_GET_STR_INFO(Platform, m_platform, param_name);
+	  
+	default:
+	  throw error("Platform.get_info", CL_INVALID_VALUE);
+	}
+    }
+
+    std::vector<cl_device_id> get_devices(cl_device_type devtype);
+  }; 
+
+  
+  inline std::vector<cl_device_id> platform::get_devices(cl_device_type devtype)
+  {
+    cl_uint num_devices = 0;
+    PYOPENCL_CALL_GUARDED(clGetDeviceIDs,
+			  (m_platform, devtype, 0, 0, &num_devices));
+
+    std::vector<cl_device_id> devices(num_devices);
+    PYOPENCL_CALL_GUARDED(clGetDeviceIDs,
+			  (m_platform, devtype,
+			   num_devices, devices.empty( ) ? NULL : &devices.front(), &num_devices));
+    
+    return devices;
+  }
+
+  // }}}
+
+  // {{{ device
+  class device // : boost::noncopyable
+  {
+  public:
+    enum reference_type_t {
+      REF_NOT_OWNABLE,
+      REF_FISSION_EXT,
+#if PYOPENCL_CL_VERSION >= 0x1020
+      REF_CL_1_2,
+#endif
+    };
+  private:
+    cl_device_id m_device;
+    reference_type_t m_ref_type;
+
+  public:
+    device(cl_device_id did)
+      : m_device(did), m_ref_type(REF_NOT_OWNABLE)
+    { }
+
+    device(cl_device_id did, bool retain, reference_type_t ref_type=REF_NOT_OWNABLE)
+      : m_device(did), m_ref_type(ref_type)
+    {
+      if (retain && ref_type != REF_NOT_OWNABLE)
+        {
+          if (false)
+	    { }
+#if (defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION))
+          else if (ref_type == REF_FISSION_EXT)
+	    {
+#if PYOPENCL_CL_VERSION >= 0x1020
+	      cl_platform_id plat;
+	      PYOPENCL_CALL_GUARDED(clGetDeviceInfo, (m_device, CL_DEVICE_PLATFORM,
+						      sizeof(plat), &plat, NULL));
+#endif
+
+	      PYOPENCL_GET_EXT_FUN(plat,
+				   clRetainDeviceEXT, retain_func);
+
+	      PYOPENCL_CALL_GUARDED(retain_func, (did));
+	    }
+#endif
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+          else if (ref_type == REF_CL_1_2)
+	    {
+	      PYOPENCL_CALL_GUARDED(clRetainDevice, (did));
+	    }
+#endif
+
+          else
+            throw error("Device", CL_INVALID_VALUE,
+			"cannot own references to devices when device fission or CL 1.2 is not available");
+        }
+    }
+
+    ~device()
+    {
+      if (false)
+        { }
+#if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION)
+      else if (m_ref_type == REF_FISSION_EXT)
+        {
+#if PYOPENCL_CL_VERSION >= 0x1020
+          cl_platform_id plat;
+          PYOPENCL_CALL_GUARDED(clGetDeviceInfo, (m_device, CL_DEVICE_PLATFORM,
+						  sizeof(plat), &plat, NULL));
+#endif
+
+          PYOPENCL_GET_EXT_FUN(plat,
+			       clReleaseDeviceEXT, release_func);
+
+          PYOPENCL_CALL_GUARDED_CLEANUP(release_func, (m_device));
+        }
+#endif
+
+#if PYOPENCL_CL_VERSION >= 0x1020
+      else if (m_ref_type == REF_CL_1_2)
+	PYOPENCL_CALL_GUARDED(clReleaseDevice, (m_device));
+#endif
+    }
+
+    cl_device_id data() const
+    {
+      return m_device;
+    }
+
+    PYOPENCL_EQUALITY_TESTS(device);
+
+    std::string get_info(cl_device_info param_name) const
+    {
+#define DEV_GET_INT_INF(TYPE) PYOPENCL_GET_INTEGRAL_INFO(Device, m_device, param_name, TYPE);
+
+      switch (param_name)
+        {
+          // case CL_DEVICE_TYPE: DEV_GET_INT_INF(cl_device_type);
+	  //           case CL_DEVICE_VENDOR_ID: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_MAX_COMPUTE_UNITS: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_MAX_WORK_GROUP_SIZE: DEV_GET_INT_INF(size_t);
+
+	  //           case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+	  //             {
+	  //               std::vector<size_t> result;
+	  //               PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+	  //               PYOPENCL_RETURN_VECTOR(size_t, result);
+	  //             }
+
+	  //           case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: DEV_GET_INT_INF(cl_uint);
+
+	  //           case CL_DEVICE_MAX_CLOCK_FREQUENCY: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_ADDRESS_BITS: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_MAX_READ_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_MAX_WRITE_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_MAX_MEM_ALLOC_SIZE: DEV_GET_INT_INF(cl_ulong);
+	  //           case CL_DEVICE_IMAGE2D_MAX_WIDTH: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_IMAGE2D_MAX_HEIGHT: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_IMAGE3D_MAX_WIDTH: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_IMAGE3D_MAX_HEIGHT: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_IMAGE3D_MAX_DEPTH: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_IMAGE_SUPPORT: DEV_GET_INT_INF(cl_bool);
+	  //           case CL_DEVICE_MAX_PARAMETER_SIZE: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_MAX_SAMPLERS: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_MEM_BASE_ADDR_ALIGN: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_SINGLE_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
+	  // #ifdef CL_DEVICE_DOUBLE_FP_CONFIG
+	  //           case CL_DEVICE_DOUBLE_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
+	  // #endif
+	  // #ifdef CL_DEVICE_HALF_FP_CONFIG
+	  //           case CL_DEVICE_HALF_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
+	  // #endif
+
+	  //           case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: DEV_GET_INT_INF(cl_device_mem_cache_type);
+	  //           case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: DEV_GET_INT_INF(cl_ulong);
+	  //           case CL_DEVICE_GLOBAL_MEM_SIZE: DEV_GET_INT_INF(cl_ulong);
+
+	  //           case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: DEV_GET_INT_INF(cl_ulong);
+	  //           case CL_DEVICE_MAX_CONSTANT_ARGS: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_LOCAL_MEM_TYPE: DEV_GET_INT_INF(cl_device_local_mem_type);
+	  //           case CL_DEVICE_LOCAL_MEM_SIZE: DEV_GET_INT_INF(cl_ulong);
+	  //           case CL_DEVICE_ERROR_CORRECTION_SUPPORT: DEV_GET_INT_INF(cl_bool);
+	  //           case CL_DEVICE_PROFILING_TIMER_RESOLUTION: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_ENDIAN_LITTLE: DEV_GET_INT_INF(cl_bool);
+	  //           case CL_DEVICE_AVAILABLE: DEV_GET_INT_INF(cl_bool);
+	  //           case CL_DEVICE_COMPILER_AVAILABLE: DEV_GET_INT_INF(cl_bool);
+	  //           case CL_DEVICE_EXECUTION_CAPABILITIES: DEV_GET_INT_INF(cl_device_exec_capabilities);
+	  //           case CL_DEVICE_QUEUE_PROPERTIES: DEV_GET_INT_INF(cl_command_queue_properties);
+
+	case CL_DEVICE_NAME:
+	case CL_DEVICE_VENDOR:
+	case CL_DRIVER_VERSION:
+	case CL_DEVICE_PROFILE:
+	case CL_DEVICE_VERSION:
+	case CL_DEVICE_EXTENSIONS:
+	  PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+
+	  //           case CL_DEVICE_PLATFORM:
+	  //             PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_platform_id, platform);
+
+	  // #if PYOPENCL_CL_VERSION >= 0x1010
+	  //           case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: DEV_GET_INT_INF(cl_uint);
+
+	  //           case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF: DEV_GET_INT_INF(cl_uint);
+
+	  //           case CL_DEVICE_HOST_UNIFIED_MEMORY: DEV_GET_INT_INF(cl_bool);
+	  //           case CL_DEVICE_OPENCL_C_VERSION:
+	  //             PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+	  // #endif
+	  // #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+	  //           case CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV:
+	  //           case CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV:
+	  //           case CL_DEVICE_REGISTERS_PER_BLOCK_NV:
+	  //           case CL_DEVICE_WARP_SIZE_NV:
+	  //             DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_GPU_OVERLAP_NV:
+	  //           case CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:
+	  //           case CL_DEVICE_INTEGRATED_MEMORY_NV:
+	  //             DEV_GET_INT_INF(cl_bool);
+	  // #endif
+	  // #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION)
+	  //           case CL_DEVICE_PARENT_DEVICE_EXT:
+	  //             PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_device_id, device);
+	  //           case CL_DEVICE_PARTITION_TYPES_EXT:
+	  //           case CL_DEVICE_AFFINITY_DOMAINS_EXT:
+	  //           case CL_DEVICE_PARTITION_STYLE_EXT:
+	  //             {
+	  //               std::vector<cl_device_partition_property_ext> result;
+	  //               PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+	  //               PYOPENCL_RETURN_VECTOR(cl_device_partition_property_ext, result);
+	  //             }
+	  //           case CL_DEVICE_REFERENCE_COUNT_EXT: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #if PYOPENCL_CL_VERSION >= 0x1020
+	  //           case CL_DEVICE_LINKER_AVAILABLE: DEV_GET_INT_INF(cl_bool);
+	  //           case CL_DEVICE_BUILT_IN_KERNELS:
+	  //             PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+	  //           case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE: DEV_GET_INT_INF(size_t);
+	  //           case CL_DEVICE_PARENT_DEVICE:
+	  //             PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_device_id, device);
+	  //           case CL_DEVICE_PARTITION_MAX_SUB_DEVICES: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_PARTITION_TYPE:
+	  //           case CL_DEVICE_PARTITION_PROPERTIES:
+	  //             {
+	  //               std::vector<cl_device_partition_property> result;
+	  //               PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+	  //               PYOPENCL_RETURN_VECTOR(cl_device_partition_property, result);
+	  //             }
+	  //           case CL_DEVICE_PARTITION_AFFINITY_DOMAIN:
+	  //             {
+	  //               std::vector<cl_device_affinity_domain> result;
+	  //               PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+	  //               PYOPENCL_RETURN_VECTOR(cl_device_affinity_domain, result);
+	  //             }
+	  //           case CL_DEVICE_REFERENCE_COUNT: DEV_GET_INT_INF(cl_uint);
+	  //           case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC: DEV_GET_INT_INF(cl_bool);
+	  //           case CL_DEVICE_PRINTF_BUFFER_SIZE: DEV_GET_INT_INF(cl_bool);
+	  // #endif
+	  // // {{{ AMD dev attrs
+	  // //
+	  // // types of AMD dev attrs divined from
+	  // // https://www.khronos.org/registry/cl/api/1.2/cl.hpp
+	  // #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+	  //           case CL_DEVICE_PROFILING_TIMER_OFFSET_AMD: DEV_GET_INT_INF(cl_ulong);
+	  // #endif
+	  // /* FIXME
+	  // #ifdef CL_DEVICE_TOPOLOGY_AMD
+	  //           case CL_DEVICE_TOPOLOGY_AMD:
+	  // #endif
+	  // */
+	  // #ifdef CL_DEVICE_BOARD_NAME_AMD
+	  //           case CL_DEVICE_BOARD_NAME_AMD: ;
+	  //             PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
+	  // #endif
+	  // #ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+	  //           case CL_DEVICE_GLOBAL_FREE_MEMORY_AMD:
+	  //             {
+	  //               std::vector<size_t> result;
+	  //               PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
+	  //               PYOPENCL_RETURN_VECTOR(size_t, result);
+	  //             }
+	  // #endif
+	  // #ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+	  //           case CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #ifdef CL_DEVICE_SIMD_WIDTH_AMD
+	  //           case CL_DEVICE_SIMD_WIDTH_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+	  //           case CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+	  //           case CL_DEVICE_WAVEFRONT_WIDTH_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+	  //           case CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+	  //           case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+	  //           case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+	  //           case CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+	  //           case CL_DEVICE_LOCAL_MEM_BANKS_AMD: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+	  // // }}}
+
+	  // #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT
+	  //           case CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT: DEV_GET_INT_INF(cl_uint);
+	  // #endif
+
+	default:
+	  throw error("Device.get_info", CL_INVALID_VALUE);
+        }
+    }
+
+    // #if PYOPENCL_CL_VERSION >= 0x1020
+    //       py::list create_sub_devices(py::object py_properties)
+    //       {
+    //         std::vector<cl_device_partition_property> properties;
+
+    //         COPY_PY_LIST(cl_device_partition_property, properties);
+    //         properties.push_back(0);
+
+    //         cl_device_partition_property *props_ptr
+    //           = properties.empty( ) ? NULL : &properties.front();
+
+    //         cl_uint num_entries;
+    //         PYOPENCL_CALL_GUARDED(clCreateSubDevices,
+    //             (m_device, props_ptr, 0, NULL, &num_entries));
+
+    //         std::vector<cl_device_id> result;
+    //         result.resize(num_entries);
+
+    //         PYOPENCL_CALL_GUARDED(clCreateSubDevices,
+    //             (m_device, props_ptr, num_entries, &result.front(), NULL));
+
+    //         py::list py_result;
+    //         BOOST_FOREACH(cl_device_id did, result)
+    //           py_result.append(handle_from_new_ptr(
+    //                 new pyopencl::device(did, /*retain*/true,
+    //                   device::REF_CL_1_2)));
+    //         return py_result;
+    //       }
+    // #endif
+
+    // #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION)
+    //       py::list create_sub_devices_ext(py::object py_properties)
+    //       {
+    //         std::vector<cl_device_partition_property_ext> properties;
+
+    // #if PYOPENCL_CL_VERSION >= 0x1020
+    //         cl_platform_id plat;
+    //         PYOPENCL_CALL_GUARDED(clGetDeviceInfo, (m_device, CL_DEVICE_PLATFORM,
+    //               sizeof(plat), &plat, NULL));
+    // #endif
+
+    //         PYOPENCL_GET_EXT_FUN(plat, clCreateSubDevicesEXT, create_sub_dev);
+
+    //         COPY_PY_LIST(cl_device_partition_property_ext, properties);
+    //         properties.push_back(CL_PROPERTIES_LIST_END_EXT);
+
+    //         cl_device_partition_property_ext *props_ptr
+    //           = properties.empty( ) ? NULL : &properties.front();
+
+    //         cl_uint num_entries;
+    //         PYOPENCL_CALL_GUARDED(create_sub_dev,
+    //             (m_device, props_ptr, 0, NULL, &num_entries));
+
+    //         std::vector<cl_device_id> result;
+    //         result.resize(num_entries);
+
+    //         PYOPENCL_CALL_GUARDED(create_sub_dev,
+    //             (m_device, props_ptr, num_entries, &result.front(), NULL));
+
+    //         py::list py_result;
+    //         BOOST_FOREACH(cl_device_id did, result)
+    //           py_result.append(handle_from_new_ptr(
+    //                 new pyopencl::device(did, /*retain*/true,
+    //                   device::REF_FISSION_EXT)));
+    //         return py_result;
+    //       }
+    // #endif
+
+    
+  };
+
+  // }}}
+
+  
+
+  // {{{ context
+  class context // : public boost::noncopyable
+  {
+  private:
+    cl_context m_context;
+
+  public:
+    context(cl_context ctx, bool retain)
+      : m_context(ctx)
+    {
+      if (retain)
+	PYOPENCL_CALL_GUARDED(clRetainContext, (ctx));
+    }
+
+
+    ~context()
+    {
+      // TODO
+      // PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseContext,
+      // 			      (m_context));
+    }
+
+    cl_context data() const
+    {
+      return m_context;
+    }
+
+    //PYOPENCL_EQUALITY_TESTS(context);
+
+    //       py::object get_info(cl_context_info param_name) const
+    //       {
+    //         switch (param_name)
+    // 	  {
+    //           case CL_CONTEXT_REFERENCE_COUNT:
+    //             PYOPENCL_GET_INTEGRAL_INFO(
+    // 				       Context, m_context, param_name, cl_uint);
+
+    //           case CL_CONTEXT_DEVICES:
+    //             {
+    //               std::vector<cl_device_id> result;
+    //               PYOPENCL_GET_VEC_INFO(Context, m_context, param_name, result);
+
+    //               py::list py_result;
+    //               BOOST_FOREACH(cl_device_id did, result)
+    //                 py_result.append(handle_from_new_ptr(
+    // 						     new pyopencl::device(did)));
+    //               return py_result;
+    //             }
+
+    //           case CL_CONTEXT_PROPERTIES:
+    //             {
+    //               std::vector<cl_context_properties> result;
+    //               PYOPENCL_GET_VEC_INFO(Context, m_context, param_name, result);
+
+    //               py::list py_result;
+    //               for (size_t i = 0; i < result.size(); i+=2)
+    // 		{
+    // 		  cl_context_properties key = result[i];
+    // 		  py::object value;
+    // 		  switch (key)
+    // 		    {
+    // 		    case CL_CONTEXT_PLATFORM:
+    // 		      {
+    // 			value = py::object(
+    // 					   handle_from_new_ptr(new platform(
+    // 									    reinterpret_cast<cl_platform_id>(result[i+1]))));
+    // 			break;
+    // 		      }
+
+    // #if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1)
+    // #if defined(__APPLE__) && defined(HAVE_GL)
+    // 		    case CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE:
+    // #else
+    // 		    case CL_GL_CONTEXT_KHR:
+    // 		    case CL_EGL_DISPLAY_KHR:
+    // 		    case CL_GLX_DISPLAY_KHR:
+    // 		    case CL_WGL_HDC_KHR:
+    // 		    case CL_CGL_SHAREGROUP_KHR:
+    // #endif
+    // 		      value = py::object(result[i+1]);
+    // 		      break;
+
+    // #endif
+    // 		    case 0:
+    // 		      break;
+
+    // 		    default:
+    // 		      throw error("Context.get_info", CL_INVALID_VALUE,
+    // 				  "unknown context_property key encountered");
+    // 		    }
+
+    // 		  py_result.append(py::make_tuple(result[i], value));
+    // 		}
+    //               return py_result;
+    //             }
+
+    // #if PYOPENCL_CL_VERSION >= 0x1010
+    //           case CL_CONTEXT_NUM_DEVICES:
+    //             PYOPENCL_GET_INTEGRAL_INFO(
+    // 				       Context, m_context, param_name, cl_uint);
+    // #endif
+
+    //           default:
+    //             throw error("Context.get_info", CL_INVALID_VALUE);
+    // 	  }
+    //       }
+    //     };
+
+
+
+    // }}}
+
+  };
+
+  // {{{ command_queue
+  class command_queue
+  {
+  private:
+    cl_command_queue m_queue;
+
+  public:
+    command_queue(cl_command_queue q, bool retain)
+      : m_queue(q)
+    {
+      if (retain)
+	PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (q));
+    }
+
+    command_queue(command_queue const &src)
+      : m_queue(src.m_queue)
+    {
+      PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
+    }
+
+    command_queue(
+		  const context &ctx,
+		  const device *py_dev=0,
+		  cl_command_queue_properties props=0)
+    {
+      cl_device_id dev;
+      if (py_dev)
+	dev = py_dev->data();
+      else
+        {
+	  // TODO
+          // std::vector<cl_device_id> devs;
+          // PYOPENCL_GET_VEC_INFO(Context, ctx.data(), CL_CONTEXT_DEVICES, devs);
+          // if (devs.size() == 0)
+          //   throw pyopencl::error("CommandQueue", CL_INVALID_VALUE,
+          //       "context doesn't have any devices? -- don't know which one to default to");
+          // dev = devs[0];
+        }
+
+      cl_int status_code;
+      PYOPENCL_PRINT_CALL_TRACE("clCreateCommandQueue");
+      m_queue = clCreateCommandQueue(
+				     ctx.data(), dev, props, &status_code);
+
+      if (status_code != CL_SUCCESS) {
+	throw pyopencl::error("CommandQueue", status_code);
+      }
+    }
+
+    ~command_queue()
+    {
+      // TODO
+      // PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue,
+      //     (m_queue));
+    }
+
+    const cl_command_queue data() const
+    { return m_queue; }
+
+    // PYOPENCL_EQUALITY_TESTS(command_queue);
+
+    // py::object get_info(cl_command_queue_info param_name) const
+    // {
+    //   switch (param_name)
+    //   {
+    //     case CL_QUEUE_CONTEXT:
+    //       PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name,
+    //           cl_context, context);
+    //     case CL_QUEUE_DEVICE:
+    //       PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name,
+    //           cl_device_id, device);
+    //     case CL_QUEUE_REFERENCE_COUNT:
+    //       PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name,
+    //           cl_uint);
+    //     case CL_QUEUE_PROPERTIES:
+    //       PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name,
+    //           cl_command_queue_properties);
+
+    //     default:
+    //       throw error("CommandQueue.get_info", CL_INVALID_VALUE);
+    //   }
+    // }
+
+    std::auto_ptr<context> get_context() const
+    {
+      cl_context param_value;
+      PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo,
+			    (m_queue, CL_QUEUE_CONTEXT, sizeof(param_value), &param_value, 0));
+      return std::auto_ptr<context>(
+				    new context(param_value, /*retain*/ true));
+    }
+
+#if PYOPENCL_CL_VERSION < 0x1010
+    cl_command_queue_properties set_property(
+					     cl_command_queue_properties prop,
+					     bool enable)
+    {
+      cl_command_queue_properties old_prop;
+      PYOPENCL_CALL_GUARDED(clSetCommandQueueProperty,
+			    (m_queue, prop, PYOPENCL_CAST_BOOL(enable), &old_prop));
+      return old_prop;
+    }
+#endif
+
+    void flush()
+    { PYOPENCL_CALL_GUARDED(clFlush, (m_queue)); }
+    void finish()
+    {
+      // TODO
+      // PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_queue));
+    }
+  };
+
+  // }}}
+
+
+  // {{{ memory_object
+
+  //py::object create_mem_object_wrapper(cl_mem mem);
+
+  class memory_object_holder
+  {
+    public:
+      virtual const cl_mem data() const = 0;
+
+    //PYOPENCL_EQUALITY_TESTS(memory_object_holder);
+
+      size_t size() const
+      {
+        size_t param_value;
+        PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+            (data(), CL_MEM_SIZE, sizeof(param_value), &param_value, 0));
+        return param_value;
+      }
+
+    generic_info get_info(cl_mem_info param_name) {
+      switch (param_name){
+      case CL_MEM_TYPE:
+	PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+				   cl_mem_object_type);
+//       case CL_MEM_FLAGS:
+// 	PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+// 				   cl_mem_flags);
+//       case CL_MEM_SIZE:
+// 	PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+// 				   size_t);
+//       case CL_MEM_HOST_PTR:
+// 	throw pyopencl::error("MemoryObject.get_info", CL_INVALID_VALUE,
+// 			      "Use MemoryObject.get_host_array to get host pointer.");
+//       case CL_MEM_MAP_COUNT:
+// 	PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+// 				   cl_uint);
+//       case CL_MEM_REFERENCE_COUNT:
+// 	PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+// 				   cl_uint);
+//       case CL_MEM_CONTEXT:
+// 	PYOPENCL_GET_OPAQUE_INFO(MemObject, data(), param_name,
+// 				 cl_context, context);
+
+// #if PYOPENCL_CL_VERSION >= 0x1010
+//       case CL_MEM_ASSOCIATED_MEMOBJECT:
+// 	{
+// 	  cl_mem param_value;
+// 	  PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (data(), param_name, sizeof(param_value), &param_value, 0));
+// 	  if (param_value == 0)
+// 	    {
+// 	      // no associated memory object? no problem.
+// 	      return py::object();
+// 	    }
+
+// 	  return create_mem_object_wrapper(param_value);
+// 	}
+//       case CL_MEM_OFFSET:
+// 	PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+// 				   size_t);
+// #endif
+
+      default:
+	throw error("MemoryObjectHolder.get_info", CL_INVALID_VALUE);
+      }
+    }
+  };
+
+
+  class memory_object : /*boost::noncopyable, */ public memory_object_holder
+  {
+    private:
+      bool m_valid;
+      cl_mem m_mem;
+      void* m_hostbuf;
+
+    public:
+      memory_object(cl_mem mem, bool retain, void* hostbuf=0)
+        : m_valid(true), m_mem(mem)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainMemObject, (mem));
+
+        if (hostbuf)
+          m_hostbuf = hostbuf;
+      }
+
+      memory_object(memory_object const &src)
+        : m_valid(true), m_mem(src.m_mem), m_hostbuf(src.m_hostbuf)
+      {
+        PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem));
+      }
+
+      memory_object(memory_object_holder const &src)
+        : m_valid(true), m_mem(src.data())
+      {
+        PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem));
+      }
+
+      void release()
+      {
+        if (!m_valid)
+            throw error("MemoryObject.free", CL_INVALID_VALUE,
+                "trying to double-unref mem object");
+	// TODO
+        //PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseMemObject, (m_mem));
+        m_valid = false;
+      }
+
+      virtual ~memory_object()
+      {
+        if (m_valid)
+          release();
+      }
+
+      void* hostbuf()
+      { return m_hostbuf; }
+
+      const cl_mem data() const
+      { return m_mem; }
+
+  };
+
+// #if PYOPENCL_CL_VERSION >= 0x1020
+//   inline
+//   event *enqueue_migrate_mem_objects(
+//       command_queue &cq,
+//       py::object py_mem_objects,
+//       cl_mem_migration_flags flags,
+//       py::object py_wait_for)
+//   {
+//     PYOPENCL_PARSE_WAIT_FOR;
+
+//     std::vector<cl_mem> mem_objects;
+//     PYTHON_FOREACH(mo, py_mem_objects)
+//       mem_objects.push_back(py::extract<memory_object &>(mo)().data());
+
+//     cl_event evt;
+//     PYOPENCL_RETRY_IF_MEM_ERROR(
+//       PYOPENCL_CALL_GUARDED(clEnqueueMigrateMemObjects, (
+//             cq.data(),
+//             mem_objects.size(), mem_objects.empty( ) ? NULL : &mem_objects.front(),
+//             flags,
+//             PYOPENCL_WAITLIST_ARGS, &evt
+//             ));
+//       );
+//     PYOPENCL_RETURN_NEW_EVENT(evt);
+//   }
+// #endif
+
+// #ifdef cl_ext_migrate_memobject
+//   inline
+//   event *enqueue_migrate_mem_object_ext(
+//       command_queue &cq,
+//       py::object py_mem_objects,
+//       cl_mem_migration_flags_ext flags,
+//       py::object py_wait_for)
+//   {
+//     PYOPENCL_PARSE_WAIT_FOR;
+
+// #if PYOPENCL_CL_VERSION >= 0x1020
+//     // {{{ get platform
+//     cl_device_id dev;
+//     PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, (cq.data(), CL_QUEUE_DEVICE,
+//           sizeof(dev), &dev, NULL));
+//     cl_platform_id plat;
+//     PYOPENCL_CALL_GUARDED(clGetDeviceInfo, (cq.data(), CL_DEVICE_PLATFORM,
+//           sizeof(plat), &plat, NULL));
+//     // }}}
+// #endif
+
+//     PYOPENCL_GET_EXT_FUN(plat,
+//         clEnqueueMigrateMemObjectEXT, enqueue_migrate_fn);
+
+//     std::vector<cl_mem> mem_objects;
+//     PYTHON_FOREACH(mo, py_mem_objects)
+//       mem_objects.push_back(py::extract<memory_object &>(mo)().data());
+
+//     cl_event evt;
+//     PYOPENCL_RETRY_IF_MEM_ERROR(
+//       PYOPENCL_CALL_GUARDED(enqueue_migrate_fn, (
+//             cq.data(),
+//             mem_objects.size(), mem_objects.empty( ) ? NULL : &mem_objects.front(),
+//             flags,
+//             PYOPENCL_WAITLIST_ARGS, &evt
+//             ));
+//       );
+//     PYOPENCL_RETURN_NEW_EVENT(evt);
+//   }
+// #endif
+
+  // }}}
+
+  
+  // {{{ buffer
+
+  inline cl_mem create_buffer(
+      cl_context ctx,
+      cl_mem_flags flags,
+      size_t size,
+      void *host_ptr)
+  {
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clCreateBuffer");
+    cl_mem mem = clCreateBuffer(ctx, flags, size, host_ptr, &status_code);
+
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("create_buffer", status_code);
+
+    return mem;
+  }
+
+
+  
+  inline cl_mem create_buffer_gc(cl_context ctx,
+				 cl_mem_flags flags,
+				 size_t size,
+				 void *host_ptr)
+  {
+    // TODO
+    //PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
+    return create_buffer(ctx, flags, size, host_ptr);
+    // );
+  }
+
+  
+
+#if PYOPENCL_CL_VERSION >= 0x1010
+  inline cl_mem create_sub_buffer(
+      cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type bct,
+      const void *buffer_create_info)
+  {
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clCreateSubBuffer");
+    cl_mem mem = clCreateSubBuffer(buffer, flags,
+        bct, buffer_create_info, &status_code);
+
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clCreateSubBuffer", status_code);
+
+    return mem;
+  }
+
+
+
+
+  inline cl_mem create_sub_buffer_gc(
+      cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type bct,
+      const void *buffer_create_info)
+  {
+    // TODO
+    //PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
+    return create_sub_buffer(buffer, flags, bct, buffer_create_info);
+    //);
+  }
+#endif
+
+class buffer : public memory_object
+  {
+    public:
+      buffer(cl_mem mem, bool retain, void *hostbuf=0)
+        : memory_object(mem, retain, hostbuf)
+      { }
+
+// #if PYOPENCL_CL_VERSION >= 0x1010
+//       buffer *get_sub_region(
+//           size_t origin, size_t size, cl_mem_flags flags) const
+//       {
+//         cl_buffer_region region = { origin, size};
+
+//         cl_mem mem = create_sub_buffer_gc(
+//             data(), flags, CL_BUFFER_CREATE_TYPE_REGION, &region);
+
+//         try
+//         {
+//           return new buffer(mem, false);
+//         }
+//         catch (...)
+//         {
+//           PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
+//           throw;
+//         }
+//       }
+
+//       buffer *getitem(py::slice slc) const
+//       {
+//         PYOPENCL_BUFFER_SIZE_T start, end, stride, length;
+
+//         size_t my_length;
+//         PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+//             (data(), CL_MEM_SIZE, sizeof(my_length), &my_length, 0));
+
+// #if PY_VERSION_HEX >= 0x03020000
+//         if (PySlice_GetIndicesEx(slc.ptr(),
+// #else
+//         if (PySlice_GetIndicesEx(reinterpret_cast<PySliceObject *>(slc.ptr()),
+// #endif
+//               my_length, &start, &end, &stride, &length) != 0)
+//           throw py::error_already_set();
+
+//         if (stride != 1)
+//           throw pyopencl::error("Buffer.__getitem__", CL_INVALID_VALUE,
+//               "Buffer slice must have stride 1");
+
+//         cl_mem_flags my_flags;
+//         PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+//             (data(), CL_MEM_FLAGS, sizeof(my_flags), &my_flags, 0));
+
+//         return get_sub_region(start, end, my_flags);
+//       }
+// #endif
+  };
+
+  // {{{ buffer creation
+
+  inline
+  buffer *create_buffer_py(
+      context &ctx,
+      cl_mem_flags flags,
+      size_t size,
+      void* py_hostbuf
+      )
+  {
+    
+    void *buf = py_hostbuf;
+    void *retained_buf_obj = 0;
+    if (py_hostbuf != NULL)
+    {
+      if (flags & CL_MEM_USE_HOST_PTR)
+        retained_buf_obj = py_hostbuf;
+
+    }
+
+    cl_mem mem = create_buffer_gc(ctx.data(), flags, size, buf);
+
+    try
+    {
+      return new buffer(mem, false, retained_buf_obj);
+    }
+    catch (...)
+    {
+      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
+      throw;
+    }
+  }
+
+  
+  // }}}
+
+  
+  // {{{ program
+
+  class program //: boost::noncopyable
+  {
+    public:
+      enum program_kind_type { KND_UNKNOWN, KND_SOURCE, KND_BINARY };
+
+    private:
+      cl_program m_program;
+      program_kind_type m_program_kind;
+
+    public:
+      program(cl_program prog, bool retain, program_kind_type progkind=KND_UNKNOWN)
+        : m_program(prog), m_program_kind(progkind)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainProgram, (prog));
+      }
+
+      ~program()
+      {
+	// TODO
+        //PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseProgram, (m_program));
+      }
+
+      cl_program data() const
+      {
+        return m_program;
+      }
+
+      program_kind_type kind() const
+      {
+        return m_program_kind;
+      }
+
+    //PYOPENCL_EQUALITY_TESTS(program);
+
+    std::vector<cl_device_id> get_info__devices()
+    {
+      std::vector<cl_device_id> result;
+      PYOPENCL_GET_VEC_INFO(Program, m_program, CL_PROGRAM_DEVICES, result);
+      return result;
+    }
+
+    char** get_info__binaries(uint32_t *num_binaries) {
+      std::vector<size_t> sizes;
+      PYOPENCL_GET_VEC_INFO(Program, m_program, CL_PROGRAM_BINARY_SIZES, sizes);
+      
+      *num_binaries = sizes.size();
+      
+      MALLOC(char *, result_ptrs, sizes.size());
+      
+      for (unsigned i = 0; i < sizes.size(); ++i) {
+	result_ptrs[i] = new char[sizes[i]+1];
+	result_ptrs[i][sizes[i]] = '\0';
+      }
+      PYOPENCL_CALL_GUARDED(clGetProgramInfo,
+			    (m_program, CL_PROGRAM_BINARIES, sizes.size()*sizeof(char *),
+			     result_ptrs, 0)); \
+      return result_ptrs;
+//       py::list py_result;
+//       ptr = result.get();
+//       for (unsigned i = 0; i < sizes.size(); ++i) {
+// 	py::handle<> binary_pyobj(
+// #if PY_VERSION_HEX >= 0x03000000
+// 				  PyBytes_FromStringAndSize(
+// 							    reinterpret_cast<char *>(ptr), sizes[i])
+// #else
+// 				  PyString_FromStringAndSize(
+// 							     reinterpret_cast<char *>(ptr), sizes[i])
+// #endif
+// 				  );
+// 	py_result.append(binary_pyobj);
+// 	ptr += sizes[i];
+      }
+    
+
+      //py::object get_info(cl_program_info param_name) const
+//       {
+//         switch (param_name)
+//         {
+//           case CL_PROGRAM_REFERENCE_COUNT:
+//             PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+//                 cl_uint);
+//           case CL_PROGRAM_CONTEXT:
+//             PYOPENCL_GET_OPAQUE_INFO(Program, m_program, param_name,
+//                 cl_context, context);
+//           case CL_PROGRAM_NUM_DEVICES:
+//             PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+//                 cl_uint);
+//           case CL_PROGRAM_DEVICES:
+//             {
+//               std::vector<cl_device_id> result;
+//               PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result);
+
+//               py::list py_result;
+//               BOOST_FOREACH(cl_device_id did, result)
+//                 py_result.append(handle_from_new_ptr(
+//                       new pyopencl::device(did)));
+//               return py_result;
+//             }
+//           case CL_PROGRAM_SOURCE:
+//             PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
+//           case CL_PROGRAM_BINARY_SIZES:
+//             {
+//               std::vector<size_t> result;
+//               PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result);
+//               PYOPENCL_RETURN_VECTOR(size_t, result);
+//             }
+//           case CL_PROGRAM_BINARIES:
+//             // {{{
+//             {
+//               std::vector<size_t> sizes;
+//               PYOPENCL_GET_VEC_INFO(Program, m_program, CL_PROGRAM_BINARY_SIZES, sizes);
+
+//               size_t total_size = std::accumulate(sizes.begin(), sizes.end(), 0);
+
+//               boost::scoped_array<unsigned char> result(
+//                   new unsigned char[total_size]);
+//               std::vector<unsigned char *> result_ptrs;
+
+//               unsigned char *ptr = result.get();
+//               for (unsigned i = 0; i < sizes.size(); ++i)
+//               {
+//                 result_ptrs.push_back(ptr);
+//                 ptr += sizes[i];
+//               }
+
+//               PYOPENCL_CALL_GUARDED(clGetProgramInfo,
+//                   (m_program, param_name, sizes.size()*sizeof(unsigned char *),
+//                    result_ptrs.empty( ) ? NULL : &result_ptrs.front(), 0)); 
+
+//               py::list py_result;
+//               ptr = result.get();
+//               for (unsigned i = 0; i < sizes.size(); ++i)
+//               {
+//                 py::handle<> binary_pyobj(
+// #if PY_VERSION_HEX >= 0x03000000
+//                     PyBytes_FromStringAndSize(
+//                       reinterpret_cast<char *>(ptr), sizes[i])
+// #else
+//                     PyString_FromStringAndSize(
+//                       reinterpret_cast<char *>(ptr), sizes[i])
+// #endif
+//                     );
+//                 py_result.append(binary_pyobj);
+//                 ptr += sizes[i];
+//               }
+//               return py_result;
+//             }
+//             // }}}
+// #if PYOPENCL_CL_VERSION >= 0x1020
+//           case CL_PROGRAM_NUM_KERNELS:
+//             PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+//                 size_t);
+//           case CL_PROGRAM_KERNEL_NAMES:
+//             PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
+// #endif
+
+//           default:
+//             throw error("Program.get_info", CL_INVALID_VALUE);
+//         }
+//       }
+
+//       py::object get_build_info(
+//           device const &dev,
+//           cl_program_build_info param_name) const
+//       {
+//         switch (param_name)
+//         {
+// #define PYOPENCL_FIRST_ARG m_program, dev.data() // hackety hack
+//           case CL_PROGRAM_BUILD_STATUS:
+//             PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild,
+//                 PYOPENCL_FIRST_ARG, param_name,
+//                 cl_build_status);
+//           case CL_PROGRAM_BUILD_OPTIONS:
+//           case CL_PROGRAM_BUILD_LOG:
+//             PYOPENCL_GET_STR_INFO(ProgramBuild,
+//                 PYOPENCL_FIRST_ARG, param_name);
+// #if PYOPENCL_CL_VERSION >= 0x1020
+//           case CL_PROGRAM_BINARY_TYPE:
+//             PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild,
+//                 PYOPENCL_FIRST_ARG, param_name,
+//                 cl_program_binary_type);
+// #endif
+// #undef PYOPENCL_FIRST_ARG
+
+//           default:
+//             throw error("Program.get_build_info", CL_INVALID_VALUE);
+//         }
+//       }
+
+      void build(char* options, cl_uint num_devices, void** ptr_devices)
+      { 
+	// todo: this function should get a list of device instances, not raw pointers
+	// pointers are for the cffi interface and should not be here
+        std::vector<cl_device_id> devices(num_devices);
+	for(cl_uint i = 0; i < num_devices; ++i) {
+	  devices[i] = static_cast<device*>(ptr_devices[i])->data();
+	}
+        PYOPENCL_CALL_GUARDED_THREADED(clBuildProgram,
+            (m_program, num_devices, devices.empty( ) ? NULL : &devices.front(),
+             options, 0 ,0));
+      }
+
+// #if PYOPENCL_CL_VERSION >= 0x1020
+//       void compile(std::string options, py::object py_devices,
+//           py::object py_headers)
+//       {
+//         PYOPENCL_PARSE_PY_DEVICES;
+
+//         // {{{ pick apart py_headers
+//         // py_headers is a list of tuples *(name, program)*
+
+//         std::vector<std::string> header_names;
+//         std::vector<cl_program> programs;
+//         PYTHON_FOREACH(name_hdr_tup, py_headers)
+//         {
+//           if (py::len(name_hdr_tup) != 2)
+//             throw error("Program.compile", CL_INVALID_VALUE,
+//                 "epxected (name, header) tuple in headers list");
+//           std::string name = py::extract<std::string const &>(name_hdr_tup[0]);
+//           program &prg = py::extract<program &>(name_hdr_tup[1]);
+
+//           header_names.push_back(name);
+//           programs.push_back(prg.data());
+//         }
+
+//         std::vector<const char *> header_name_ptrs;
+//         BOOST_FOREACH(std::string const &name, header_names)
+//           header_name_ptrs.push_back(name.c_str());
+
+//         // }}}
+
+//         PYOPENCL_CALL_GUARDED_THREADED(clCompileProgram,
+//             (m_program, num_devices, devices,
+//              options.c_str(), header_names.size(),
+//              programs.empty() ? NULL : &programs.front(),
+//              header_name_ptrs.empty() ? NULL : &header_name_ptrs.front(),
+//              0, 0));
+//       }
+// #endif
+  };
+
+
+
+  // {{{ kernel
+  class local_memory
+  {
+    private:
+      size_t m_size;
+
+    public:
+      local_memory(size_t size)
+        : m_size(size)
+      { }
+
+      size_t size() const
+      { return m_size; }
+  };
+
+
+
+
+  class kernel // : boost::noncopyable
+  {
+    private:
+      cl_kernel m_kernel;
+
+    public:
+      kernel(cl_kernel knl, bool retain)
+        : m_kernel(knl)
+      {
+        if (retain)
+          PYOPENCL_CALL_GUARDED(clRetainKernel, (knl));
+      }
+
+      kernel(program const &prg, std::string const &kernel_name)
+      {
+        cl_int status_code;
+
+        PYOPENCL_PRINT_CALL_TRACE("clCreateKernel");
+        m_kernel = clCreateKernel(prg.data(), kernel_name.c_str(),
+            &status_code);
+        if (status_code != CL_SUCCESS)
+          throw pyopencl::error("clCreateKernel", status_code);
+      }
+
+      ~kernel()
+      {
+	// todo
+        //PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseKernel, (m_kernel));
+      }
+
+      cl_kernel data() const
+      {
+        return m_kernel;
+      }
+
+      PYOPENCL_EQUALITY_TESTS(kernel);
+
+      void set_arg_null(cl_uint arg_index)
+      {
+        cl_mem m = 0;
+        PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index,
+              sizeof(cl_mem), &m));
+      }
+
+      void set_arg_mem(cl_uint arg_index, memory_object_holder &moh)
+      {
+        cl_mem m = moh.data();
+        PYOPENCL_CALL_GUARDED(clSetKernelArg,
+            (m_kernel, arg_index, sizeof(cl_mem), &m));
+      }
+
+      void set_arg_local(cl_uint arg_index, local_memory const &loc)
+      {
+        PYOPENCL_CALL_GUARDED(clSetKernelArg,
+            (m_kernel, arg_index, loc.size(), 0));
+      }
+
+      // void set_arg_sampler(cl_uint arg_index, sampler const &smp)
+      // {
+      //   cl_sampler s = smp.data();
+      //   PYOPENCL_CALL_GUARDED(clSetKernelArg,
+      //       (m_kernel, arg_index, sizeof(cl_sampler), &s));
+      // }
+
+      // void set_arg_buf(cl_uint arg_index, py::object py_buffer)
+      // {
+      //   const void *buf;
+      //   PYOPENCL_BUFFER_SIZE_T len;
+
+      //   if (PyObject_AsReadBuffer(py_buffer.ptr(), &buf, &len))
+      //   {
+      //     PyErr_Clear();
+      //     throw error("Kernel.set_arg", CL_INVALID_VALUE,
+      //         "invalid kernel argument");
+      //   }
+
+      //   PYOPENCL_CALL_GUARDED(clSetKernelArg,
+      //       (m_kernel, arg_index, len, buf));
+      // }
+
+      // void set_arg(cl_uint arg_index, py::object arg)
+      // {
+      //   if (arg.ptr() == Py_None)
+      //   {
+      //     set_arg_null(arg_index);
+      //     return;
+      //   }
+
+      //   py::extract<memory_object_holder &> ex_mo(arg);
+      //   if (ex_mo.check())
+      //   {
+      //     set_arg_mem(arg_index, ex_mo());
+      //     return;
+      //   }
+
+      //   py::extract<local_memory const &> ex_loc(arg);
+      //   if (ex_loc.check())
+      //   {
+      //     set_arg_local(arg_index, ex_loc());
+      //     return;
+      //   }
+
+      //   py::extract<sampler const &> ex_smp(arg);
+      //   if (ex_smp.check())
+      //   {
+      //     set_arg_sampler(arg_index, ex_smp());
+      //     return;
+      //   }
+
+      //   set_arg_buf(arg_index, arg);
+      // }
+    
+       generic_info get_info(cl_kernel_info param_name) const
+       {
+         switch (param_name)
+	   {
+//           case CL_KERNEL_FUNCTION_NAME:
+//             PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
+           case CL_KERNEL_NUM_ARGS:
+           case CL_KERNEL_REFERENCE_COUNT:
+             PYOPENCL_GET_INTEGRAL_INFO(Kernel, m_kernel, param_name,
+                 cl_uint);
+//           case CL_KERNEL_CONTEXT:
+//             PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
+//                 cl_context, context);
+//           case CL_KERNEL_PROGRAM:
+//             PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
+//                 cl_program, program);
+// #if PYOPENCL_CL_VERSION >= 0x1020
+//           case CL_KERNEL_ATTRIBUTES:
+//             PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
+// #endif
+          default:
+            throw error("Kernel.get_info", CL_INVALID_VALUE);
+	   }
+       }
+
+//       py::object get_work_group_info(
+//           cl_kernel_work_group_info param_name,
+//           device const &dev
+//           ) const
+//       {
+//         switch (param_name)
+//         {
+// #define PYOPENCL_FIRST_ARG m_kernel, dev.data() // hackety hack
+//           case CL_KERNEL_WORK_GROUP_SIZE:
+//             PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+//                 PYOPENCL_FIRST_ARG, param_name,
+//                 size_t);
+//           case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
+//             {
+//               std::vector<size_t> result;
+//               PYOPENCL_GET_VEC_INFO(KernelWorkGroup,
+//                   PYOPENCL_FIRST_ARG, param_name, result);
+
+//               PYOPENCL_RETURN_VECTOR(size_t, result);
+//             }
+//           case CL_KERNEL_LOCAL_MEM_SIZE:
+// #if PYOPENCL_CL_VERSION >= 0x1010
+//           case CL_KERNEL_PRIVATE_MEM_SIZE:
+// #endif
+//             PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+//                 PYOPENCL_FIRST_ARG, param_name,
+//                 cl_ulong);
+
+// #if PYOPENCL_CL_VERSION >= 0x1010
+//           case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+//             PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+//                 PYOPENCL_FIRST_ARG, param_name,
+//                 size_t);
+// #endif
+//           default:
+//             throw error("Kernel.get_work_group_info", CL_INVALID_VALUE);
+// #undef PYOPENCL_FIRST_ARG
+//         }
+//       }
+
+// #if PYOPENCL_CL_VERSION >= 0x1020
+//       py::object get_arg_info(
+//           cl_uint arg_index,
+//           cl_kernel_arg_info param_name
+//           ) const
+//       {
+//         switch (param_name)
+//         {
+// #define PYOPENCL_FIRST_ARG m_kernel, arg_index // hackety hack
+//           case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
+//             PYOPENCL_GET_INTEGRAL_INFO(KernelArg,
+//                 PYOPENCL_FIRST_ARG, param_name,
+//                 cl_kernel_arg_address_qualifier);
+
+//           case CL_KERNEL_ARG_ACCESS_QUALIFIER:
+//             PYOPENCL_GET_INTEGRAL_INFO(KernelArg,
+//                 PYOPENCL_FIRST_ARG, param_name,
+//                 cl_kernel_arg_access_qualifier);
+
+//           case CL_KERNEL_ARG_TYPE_NAME:
+//           case CL_KERNEL_ARG_NAME:
+//             PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name);
+// #undef PYOPENCL_FIRST_ARG
+//           default:
+//             throw error("Kernel.get_arg_info", CL_INVALID_VALUE);
+//         }
+//       }
+// #endif
+  };
+
+ 
+  // {{{ buffer transfers
+
+  inline
+  event *enqueue_read_buffer(
+      command_queue &cq,
+      memory_object_holder &mem,
+      void* buffer,
+      size_t size, 
+      size_t device_offset,
+      /*py::object py_wait_for,*/
+      bool is_blocking)
+  {
+    // TODO
+    //PYOPENCL_PARSE_WAIT_FOR;
+
+    cl_event evt;
+    // TODO
+    //PYOPENCL_RETRY_IF_MEM_ERROR(
+      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueReadBuffer,
+				     (cq.data(),
+				      mem.data(),
+				      PYOPENCL_CAST_BOOL(is_blocking),
+				      device_offset, size, buffer,
+				      0, NULL,
+				      //PYOPENCL_WAITLIST_ARGS,
+				      &evt
+				      ));
+	//);
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+
+
+  inline
+  event *enqueue_copy_buffer(command_queue &cq,
+			     memory_object_holder &src,
+			     memory_object_holder &dst,
+			     ptrdiff_t byte_count,
+			     size_t src_offset,
+			     size_t dst_offset
+			     // ,
+			     /*py::object py_wait_for*/
+			     )
+  {
+    // TODO
+    // PYOPENCL_PARSE_WAIT_FOR;
+
+    if (byte_count < 0)
+      {
+	size_t byte_count_src = 0;
+	size_t byte_count_dst = 0;
+	PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+			      (src.data(), CL_MEM_SIZE, sizeof(byte_count), &byte_count_src, 0));
+	PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
+			      (src.data(), CL_MEM_SIZE, sizeof(byte_count), &byte_count_dst, 0));
+	byte_count = std::min(byte_count_src, byte_count_dst);
+      }
+
+    cl_event evt;
+    // TODO
+    //PYOPENCL_RETRY_IF_MEM_ERROR(
+    PYOPENCL_CALL_GUARDED(clEnqueueCopyBuffer, (cq.data(),
+						src.data(), dst.data(),
+						src_offset, dst_offset,
+						byte_count,
+						0, NULL, //PYOPENCL_WAITLIST_ARGS,
+						&evt
+						))
+				  // );
+
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+  }
+
+  // }}}
+
+inline event *enqueue_nd_range_kernel(
+      command_queue &cq,
+      kernel &knl,
+      cl_uint work_dim,
+      const size_t* global_work_offset,
+      const size_t* global_work_size,
+      const size_t* local_work_size //,
+      //py::object py_global_work_offset,
+      //py::object py_wait_for,
+				      )
+  {
+    // TODO
+    // PYOPENCL_PARSE_WAIT_FOR;
+
+    cl_event evt;
+    
+    // TODO: PYOPENCL_RETRY_RETURN_IF_MEM_ERROR
+    PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel,
+			  (cq.data(),
+			   knl.data(),
+			   work_dim,
+			   global_work_offset,
+			   global_work_size,
+			   local_work_size,
+			   0, NULL,// PYOPENCL_WAITLIST_ARGS,
+			   &evt
+			   ));
+    PYOPENCL_RETURN_NEW_EVENT(evt);
+    
+  }
+
+
+
+  // }}}
+  inline
+  program *create_program_with_source(
+      context &ctx,
+      std::string const &src)
+  {
+    const char *string = src.c_str();
+    size_t length = src.size();
+
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithSource");
+    cl_program result = clCreateProgramWithSource(
+        ctx.data(), 1, &string, &length, &status_code);
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clCreateProgramWithSource", status_code);
+
+    try
+    {
+      return new program(result, false, program::KND_SOURCE);
+    }
+    catch (...)
+    {
+      clReleaseProgram(result);
+      throw;
+    }
+  }
+
+  inline
+  program *create_program_with_binary(
+      context &ctx,
+      cl_uint num_devices, 
+      void** ptr_devices,
+      cl_uint num_binaries,
+      char** binaries)
+  {
+    std::vector<cl_device_id> devices;
+    std::vector<size_t> sizes;
+    std::vector<cl_int> binary_statuses;
+
+    for (cl_uint i = 0; i < num_devices; ++i)
+    {
+      devices.push_back(static_cast<device*>(ptr_devices[i])->data());
+      sizes.push_back(strlen(const_cast<const char*>(binaries[i])));
+    }
+    binary_statuses.resize(num_devices);
+    cl_int status_code;
+    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBinary");
+    cl_program result = clCreateProgramWithBinary(
+        ctx.data(), num_devices,
+        devices.empty( ) ? NULL : &devices.front(),
+        sizes.empty( ) ? NULL : &sizes.front(),
+        reinterpret_cast<const unsigned char**>(const_cast<const char**>(binaries)), // todo: valid cast?
+        binary_statuses.empty( ) ? NULL : &binary_statuses.front(),
+        &status_code);
+    if (status_code != CL_SUCCESS)
+      throw pyopencl::error("clCreateProgramWithBinary", status_code);
+    
+    // for (cl_uint i = 0; i < num_devices; ++i)
+    //   std::cout << i << ":" << binary_statuses[i] << std::endl;
+
+    try
+    {
+      return new program(result, false, program::KND_BINARY);
+    }
+    catch (...)
+    {
+      clReleaseProgram(result);
+      throw;
+    }
+  }
+
+
+
+
+  
+
+  void* get_platforms(void** ptr_platforms, uint32_t *num_platforms) {
+    *num_platforms = 0;
+    PYOPENCL_CALL_GUARDED(clGetPlatformIDs, (0, 0, num_platforms));
+
+    typedef std::vector<cl_platform_id> vec;
+    vec platforms(*num_platforms);
+    PYOPENCL_CALL_GUARDED(clGetPlatformIDs,
+			  (*num_platforms, platforms.empty( ) ? NULL : &platforms.front(), num_platforms));
+
+    MALLOC(platform*, _ptr_platforms, *num_platforms);
+    for(vec::size_type i = 0; i < platforms.size(); ++i) {
+      _ptr_platforms[i] = new platform(platforms[i]);
+    }
+    *ptr_platforms = _ptr_platforms;
+    return 0;
+  }
+
+  void freem(void* p) {
+    free(p);
+  }
+
+  void* platform__get_info(void* ptr_platform, cl_platform_info param_name, char** out) {
+    // todo: catch error
+    *out = _copy_str(static_cast<platform*>(ptr_platform)->get_info(param_name));
+    return 0;
+  }
+
+  void* platform__get_devices(void* ptr_platform, void** ptr_devices, uint32_t* num_devices, cl_device_type devtype) {
+    typedef std::vector<cl_device_id> vec;
+    
+    // todo: catch error
+    vec devices = static_cast<platform*>(ptr_platform)->get_devices(devtype);
+    *num_devices = devices.size();
+
+    MALLOC(device*, _ptr_devices, *num_devices);
+    for(vec::size_type i = 0; i < devices.size(); ++i) {
+      _ptr_devices[i] = new device(devices[i]);
+    }
+    *ptr_devices = _ptr_devices;
+    
+    return 0;
+  }
+
+  void* device__get_info(void* ptr_device, cl_device_info param_name, char** out) {
+    // todo: catch error
+    *out = _copy_str(static_cast<device*>(ptr_device)->get_info(param_name));
+    return 0;
+  }
+
+  void* _create_context(void** ptr_ctx, cl_context_properties* properties, cl_uint num_devices, void** ptr_devices) {
+  
+    cl_int status_code;
+    std::vector<cl_device_id> devices(num_devices);
+    for(cl_uint i = 0; i < num_devices; ++i) {
+      devices[i] = static_cast<device*>(ptr_devices[i])->data();
+    }
+    cl_context ctx = clCreateContext(properties,
+				     num_devices,
+				     devices.empty() ? NULL : &devices.front(),
+				     0, 0, &status_code);
+    if (status_code != CL_SUCCESS) {
+      std::cout << status_code << std::endl;
+      // TODO error handling
+      //throw pyopencl::error("Context", status_code);
+    }
+    *ptr_ctx = new context(ctx, false);
+    
+    return 0;
+  }
+
+  void* _create_command_queue(void** ptr_command_queue, void* ptr_context, void* ptr_device, cl_command_queue_properties properties) {
+    // todo error handling
+    context* ctx = static_cast<context*>(ptr_context);
+    device* dev = static_cast<device*>(ptr_device);
+    *ptr_command_queue = new command_queue(*ctx, dev, properties);
+    return 0;
+  }
+
+  void* _create_buffer(void** ptr_buffer, void* ptr_context, cl_mem_flags flags, size_t size, void* hostbuf) {
+    context* ctx = static_cast<context*>(ptr_context);
+    *ptr_buffer = create_buffer_py(*ctx, flags, size, hostbuf);
+    // todo error handling
+    return 0;
+  }
+
+  void* _create_program_with_source(void **ptr_program, void *ptr_context, char* src) {
+    context* ctx = static_cast<context*>(ptr_context);
+    *ptr_program = create_program_with_source(*ctx, src);
+    // todo error handling
+    return 0;
+  }
+
+  void* _create_program_with_binary(void **ptr_program, void *ptr_context, cl_uint num_devices, void** ptr_devices, cl_uint num_binaries, char** binaries) {
+    // todo: catch error
+    context* ctx = static_cast<context*>(ptr_context);
+    *ptr_program = create_program_with_binary(*ctx, num_devices, ptr_devices, num_binaries, binaries);
+    return 0;
+  }
+
+  void* program__build(void* ptr_program, char* options, cl_uint num_devices, void** ptr_devices) {
+    // todo: catch error
+    static_cast<program*>(ptr_program)->build(options, num_devices, ptr_devices);
+    return 0;
+  }
+
+  void* program__kind(void* ptr_program, int *kind) {
+    // todo: catch error
+    *kind = static_cast<program*>(ptr_program)->kind();
+    return 0;
+  }
+
+  void* program__get_info__devices(void* ptr_program, void** ptr_devices, uint32_t* num_devices) {
+
+    typedef std::vector<cl_device_id> vec;
+
+    // todo: refactor, same as get_devices()
+    
+    // todo: catch error
+    vec devices = static_cast<program*>(ptr_program)->get_info__devices();
+    *num_devices = devices.size();
+
+    MALLOC(device*, _ptr_devices, *num_devices);
+    for(vec::size_type i = 0; i < devices.size(); ++i) {
+      _ptr_devices[i] = new device(devices[i]);
+    }
+    *ptr_devices = _ptr_devices;
+    
+    return 0;
+    
+  }
+
+  void* program__get_info__binaries(void* ptr_program, char*** ptr_binaries, uint32_t* num_binaries) {
+    // todo catch error
+    *ptr_binaries = static_cast<program*>(ptr_program)->get_info__binaries(num_binaries);
+    return 0;
+  }
+
+  long device__hash(void *ptr_device) {
+    return static_cast<device*>(ptr_device)->hash();
+  }
+
+  void* _create_kernel(void** ptr_kernel, void* ptr_program, char* name) {
+    program* prg = static_cast<program*>(ptr_program);
+    *ptr_kernel = new kernel(*prg, name);
+    // todo error handling
+    return 0;
+  }
+
+  void* kernel__get_info(void* ptr_kernel, cl_kernel_info param, generic_info* out) {
+    *out = static_cast<kernel*>(ptr_kernel)->get_info(param);
+    // todo error handling
+    return 0;
+  }
+
+  void* kernel__set_arg_mem_buffer(void* ptr_kernel, cl_uint arg_index, void* ptr_buffer) {
+    buffer* buf = static_cast<buffer*>(ptr_buffer);
+    static_cast<kernel*>(ptr_kernel)->set_arg_mem(arg_index, *buf);
+    // todo error handling
+    return 0;
+  }
+
+  void* _enqueue_nd_range_kernel(void **ptr_event, void* ptr_command_queue, void* ptr_kernel, cl_uint work_dim, const size_t* global_work_offset, const size_t* global_work_size, const size_t* local_work_size) {
+    *ptr_event = enqueue_nd_range_kernel(*static_cast<command_queue*>(ptr_command_queue),
+					 *static_cast<kernel*>(ptr_kernel),
+					 work_dim,
+					 global_work_offset,
+					 global_work_size,
+					 local_work_size);
+    // todo error handling
+    
+    return 0;
+  }
+
+
+  void* _enqueue_read_buffer(void **ptr_event, void* ptr_command_queue, void* ptr_memory_object_holder, void* buffer, size_t size, size_t device_offset, int is_blocking) {
+    *ptr_event = enqueue_read_buffer(*static_cast<command_queue*>(ptr_command_queue),
+				     *static_cast<memory_object_holder*>(ptr_memory_object_holder),
+				     buffer, size, device_offset, (bool)is_blocking);
+    // todo error handling
+    return 0;
+  }
+  
+  void* memory_object_holder__get_info(void* ptr_memory_object_holder, cl_mem_info param, generic_info* out) {
+    *out = static_cast<memory_object_holder*>(ptr_memory_object_holder)->get_info(param);
+    // todo error handling
+    return 0;
+  }
+
+
+}
+
+
+
+
diff --git a/src/c_wrapper/wrap_cl.h b/src/c_wrapper/wrap_cl.h
new file mode 100644
index 0000000000000000000000000000000000000000..3abd74cfd45f8ee586df99e0ecf6e37f0cb19dda
--- /dev/null
+++ b/src/c_wrapper/wrap_cl.h
@@ -0,0 +1,73 @@
+#ifndef _WRAP_CL_H
+#define _WRAP_CL_H
+
+
+// CL 1.2 undecided:
+// clSetPrintfCallback
+
+// {{{ includes
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+
+#ifdef __APPLE__
+
+// Mac ------------------------------------------------------------------------
+#include <OpenCL/opencl.h>
+#ifdef HAVE_GL
+
+#define PYOPENCL_GL_SHARING_VERSION 1
+
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#endif
+
+#else
+
+// elsewhere ------------------------------------------------------------------
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+#if defined(_WIN32)
+#define NOMINMAX
+#include <windows.h>
+#endif
+
+#ifdef HAVE_GL
+#include <GL/gl.h>
+#include <CL/cl_gl.h>
+#endif
+
+#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
+#define PYOPENCL_GL_SHARING_VERSION cl_khr_gl_sharing
+#endif
+
+#endif
+
+
+#ifdef PYOPENCL_PRETEND_CL_VERSION
+#define PYOPENCL_CL_VERSION PYOPENCL_PRETEND_CL_VERSION
+#else
+
+#if defined(CL_VERSION_1_2)
+#define PYOPENCL_CL_VERSION 0x1020
+#elif defined(CL_VERSION_1_1)
+#define PYOPENCL_CL_VERSION 0x1010
+#else
+#define PYOPENCL_CL_VERSION 0x1000
+#endif
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  #include "wrap_cl_core.h"
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+
diff --git a/src/c_wrapper/wrap_cl_core.h b/src/c_wrapper/wrap_cl_core.h
new file mode 100644
index 0000000000000000000000000000000000000000..f25b2b4b2e82beef46c14e0ae9bbe346a59df8c2
--- /dev/null
+++ b/src/c_wrapper/wrap_cl_core.h
@@ -0,0 +1,34 @@
+  typedef enum {
+    generic_info_type_cl_uint,
+    generic_info_type_cl_mem_object_type,
+  } generic_info_type_t;
+  typedef struct {
+    generic_info_type_t type;
+    union value_t {
+      cl_uint _cl_uint;
+      cl_mem_object_type _cl_mem_object_type;
+    } value;
+  } generic_info;
+
+  int get_cl_version(void);
+  void* get_platforms(void** ptr_platforms, uint32_t* num_platforms);
+  void* platform__get_info(void* ptr_platform, cl_platform_info param_name, char** out);
+  void* platform__get_devices(void* ptr_platform, void** ptr_devices, uint32_t* num_devices, cl_device_type devtype);
+  void* device__get_info(void* ptr_device, cl_device_info param_name, char** out);
+  long device__hash(void *ptr_device);
+  void* _create_context(void** ptr_ctx, cl_context_properties* properties, cl_uint num_devices, void** ptr_devices);
+  void* _create_command_queue(void** ptr_command_queue, void* ptr_context, void* ptr_device, cl_command_queue_properties properties);
+  void* _create_buffer(void** ptr_buffer, void* ptr_context, cl_mem_flags flags, size_t size, void* hostbuf);
+  void* _create_program_with_source(void **ptr_program, void *ptr_context, char* src);
+  void* _create_program_with_binary(void **ptr_program, void *ptr_context, cl_uint num_devices, void** ptr_devices, cl_uint num_binaries, char** binaries);
+  void* program__build(void* ptr_program, char* options, cl_uint num_devices, void** ptr_devices);
+  void* program__kind(void* ptr_program, int *kind);
+  void* program__get_info__devices(void* ptr_program, void** ptr_devices, uint32_t* num_devices);
+  void* program__get_info__binaries(void* ptr_program, char*** ptr_binaries, uint32_t* num_binaries);
+  void* _create_kernel(void** ptr_kernel, void* ptr_program, char* name);
+  void* kernel__get_info(void *ptr_kernel, cl_kernel_info param, generic_info* out);
+  void* kernel__set_arg_mem_buffer(void* ptr_kernel, cl_uint arg_index, void* ptr_buffer);
+  void* _enqueue_nd_range_kernel(void **ptr_event, void* ptr_command_queue, void* ptr_kernel, cl_uint work_dim, const size_t* global_work_offset, const size_t* global_work_size, const size_t* local_work_size);
+  void* _enqueue_read_buffer(void **ptr_event, void* ptr_command_queue, void* ptr_memory_object_holder, void* buffer, size_t size, size_t device_offset, int is_blocking);
+  void* memory_object_holder__get_info(void* ptr_memory_object_holder, cl_mem_info param, generic_info* out);
+  void freem(void*);