// PyOpenCL-flavored C++ wrapper of the CL API
//
// Copyright (C) 2009 Andreas Kloeckner
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.


#ifndef _AFJHAYYTA_PYOPENCL_HEADER_SEEN_WRAP_CL_HPP
#define _AFJHAYYTA_PYOPENCL_HEADER_SEEN_WRAP_CL_HPP

// CL 1.2 undecided:
// clSetPrintfCallback

// CL 2.0 complete

// CL 2.1 complete

// CL 2.2 complete

// CL 3.0 missing:
// clCreateBufferWithProperties
// clCreateImageWithProperties
// (no wrappers for now: OpenCL 3.0 does not define any optional properties for
// buffers or images, no implementations to test with.)


// {{{ includes

#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
// #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION

#ifdef __APPLE__

// Mac ------------------------------------------------------------------------
#include <OpenCL/opencl.h>
#include "pyopencl_ext.h"
#ifdef HAVE_GL

#define PYOPENCL_GL_SHARING_VERSION 1

#include <OpenGL/OpenGL.h>
#include <OpenCL/cl_gl.h>
#include <OpenCL/cl_gl_ext.h>
#endif

#else

// elsewhere ------------------------------------------------------------------
#define CL_TARGET_OPENCL_VERSION 300

#include <CL/cl.h>
#include "pyopencl_ext.h"

#if defined(_WIN32)
#define NOMINMAX
#include <windows.h>
#endif

#ifdef HAVE_GL
#include <GL/gl.h>
#include <CL/cl_gl.h>
#endif

#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
#define PYOPENCL_GL_SHARING_VERSION cl_khr_gl_sharing
#endif

#endif

#include <functional>
#include <thread>
#include <mutex>
#include <condition_variable>

#include <cstdio>
#include <stdexcept>
#include <iostream>
#include <vector>
#include <utility>
#include <array>
#include <numeric>
#include "wrap_helpers.hpp"
#include <numpy/arrayobject.h>
#include "tools.hpp"

#ifdef PYOPENCL_PRETEND_CL_VERSION
#define PYOPENCL_CL_VERSION PYOPENCL_PRETEND_CL_VERSION
#else

#if defined(CL_VERSION_3_0)
#define PYOPENCL_CL_VERSION 0x3000
#elif defined(CL_VERSION_2_2)
#define PYOPENCL_CL_VERSION 0x2020
#elif defined(CL_VERSION_2_1)
#define PYOPENCL_CL_VERSION 0x2010
#elif defined(CL_VERSION_2_0)
#define PYOPENCL_CL_VERSION 0x2000
#elif defined(CL_VERSION_1_2)
#define PYOPENCL_CL_VERSION 0x1020
#elif defined(CL_VERSION_1_1)
#define PYOPENCL_CL_VERSION 0x1010
#else
#define PYOPENCL_CL_VERSION 0x1000
#endif

#endif


#if defined(_WIN32)
// MSVC does not understand variable-length arrays
#define PYOPENCL_STACK_CONTAINER(TYPE, NAME, COUNT) std::vector<TYPE> NAME(COUNT)
#define PYOPENCL_STACK_CONTAINER_GET_PTR(NAME) (NAME.size() ? NAME.data() : nullptr)
#else
// gcc et al complain about stripping attributes in template arguments
#define PYOPENCL_STACK_CONTAINER(TYPE, NAME, COUNT) TYPE NAME[COUNT]
#define PYOPENCL_STACK_CONTAINER_GET_PTR(NAME) NAME
#endif

// }}}


// {{{ macros and typedefs for wrappers

#if NPY_ABI_VERSION < 0x02000000
  #define PyDataType_ELSIZE(descr) ((descr)->elsize)
#endif

#if PY_VERSION_HEX >= 0x02050000
  typedef Py_ssize_t PYOPENCL_BUFFER_SIZE_T;
#else
  typedef int PYOPENCL_BUFFER_SIZE_T;
#endif

#define PYOPENCL_CAST_BOOL(B) ((B) ? CL_TRUE : CL_FALSE)





#define PYOPENCL_DEPRECATED(WHAT, KILL_VERSION, EXTRA_MSG) \
  { \
    PyErr_Warn( \
        PyExc_DeprecationWarning, \
        WHAT " is deprecated and will stop working in PyOpenCL " KILL_VERSION". " \
        EXTRA_MSG); \
  }

#if PYOPENCL_CL_VERSION >= 0x1020

#define PYOPENCL_GET_EXT_FUN(PLATFORM, NAME, VAR) \
    NAME##_fn VAR \
      = (NAME##_fn) \
      clGetExtensionFunctionAddressForPlatform(PLATFORM, #NAME); \
    \
    if (!VAR) \
      throw error(#NAME, CL_INVALID_VALUE, #NAME \
          "not available");

#else

#define PYOPENCL_GET_EXT_FUN(PLATFORM, NAME, VAR) \
    NAME##_fn VAR \
      = (NAME##_fn) \
      clGetExtensionFunctionAddress(#NAME); \
    \
    if (!VAR) \
      throw error(#NAME, CL_INVALID_VALUE, #NAME \
          "not available");

#endif


#define PYOPENCL_PARSE_PY_DEVICES \
    std::vector<cl_device_id> devices_vec; \
    cl_uint num_devices; \
    cl_device_id *devices; \
    \
    if (py_devices.ptr() == Py_None) \
    { \
      num_devices = 0; \
      devices = 0; \
    } \
    else \
    { \
      for (py::handle py_dev: py_devices) \
        devices_vec.push_back( \
            py::cast<device &>(py_dev).data()); \
      num_devices = devices_vec.size(); \
      devices = devices_vec.empty( ) ? nullptr : &devices_vec.front(); \
    } \


#define PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(OPERATION) \
    try \
    { \
      OPERATION \
    } \
    catch (pyopencl::error &e) \
    { \
      if (!e.is_out_of_memory()) \
        throw; \
    } \
    \
    /* If we get here, we got an error from CL.
     * We should run the Python GC to try and free up
     * some memory references. */ \
    run_python_gc(); \
    \
    /* Now retry the allocation. If it fails again,
     * let it fail. */ \
    { \
      OPERATION \
    }


#define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \
  { \
    bool failed_with_mem_error = false; \
    try \
    { \
      OPERATION \
    } \
    catch (pyopencl::error &e) \
    { \
      failed_with_mem_error = true; \
      if (!e.is_out_of_memory()) \
        throw; \
    } \
    \
    if (failed_with_mem_error) \
    { \
      /* If we get here, we got an error from CL.
       * We should run the Python GC to try and free up
       * some memory references. */ \
      run_python_gc(); \
      \
      /* Now retry the allocation. If it fails again,
       * let it fail. */ \
      { \
        OPERATION \
      } \
    } \
  }


#define PYOPENCL_GET_SVM_SIZE(NAME) \
  size_t NAME##_size; \
  bool NAME##_has_size = false; \
  try \
  { \
    NAME##_size = NAME.size(); \
    NAME##_has_size = true; \
  } \
  catch (size_not_available)  { }

// }}}


// {{{ tracing and error reporting
#ifdef PYOPENCL_TRACE
  #define PYOPENCL_PRINT_CALL_TRACE(NAME) \
    std::cerr << NAME << std::endl;
  #define PYOPENCL_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) \
    std::cerr << NAME << " (" << EXTRA_INFO << ')' << std::endl;
#else
  #define PYOPENCL_PRINT_CALL_TRACE(NAME) /*nothing*/
  #define PYOPENCL_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) /*nothing*/
#endif

#define PYOPENCL_CALL_GUARDED_THREADED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \
  { \
    PYOPENCL_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \
    cl_int status_code; \
    { \
      py::gil_scoped_release release; \
      status_code = NAME ARGLIST; \
    } \
    if (status_code != CL_SUCCESS) \
      throw pyopencl::error(#NAME, status_code);\
  }

#define PYOPENCL_CALL_GUARDED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \
  { \
    PYOPENCL_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \
    cl_int status_code; \
    status_code = NAME ARGLIST; \
    if (status_code != CL_SUCCESS) \
      throw pyopencl::error(#NAME, status_code);\
  }

#define PYOPENCL_CALL_GUARDED_THREADED(NAME, ARGLIST) \
  { \
    PYOPENCL_PRINT_CALL_TRACE(#NAME); \
    cl_int status_code; \
    { \
      py::gil_scoped_release release; \
      status_code = NAME ARGLIST; \
    } \
    if (status_code != CL_SUCCESS) \
      throw pyopencl::error(#NAME, status_code);\
  }

#define PYOPENCL_CALL_GUARDED(NAME, ARGLIST) \
  { \
    PYOPENCL_PRINT_CALL_TRACE(#NAME); \
    cl_int status_code; \
    status_code = NAME ARGLIST; \
    if (status_code != CL_SUCCESS) \
      throw pyopencl::error(#NAME, status_code);\
  }
#define PYOPENCL_CALL_GUARDED_CLEANUP(NAME, ARGLIST) \
  { \
    PYOPENCL_PRINT_CALL_TRACE(#NAME); \
    cl_int status_code; \
    status_code = NAME ARGLIST; \
    if (status_code != CL_SUCCESS) \
      std::cerr \
        << "PyOpenCL WARNING: a clean-up operation failed (dead context maybe?)" \
        << std::endl \
        << #NAME " failed with code " << status_code \
        << std::endl; \
  }

// }}}


// {{{ get_info helpers
#define PYOPENCL_GET_OPAQUE_INFO(WHAT, FIRST_ARG, SECOND_ARG, CL_TYPE, TYPE) \
  { \
    CL_TYPE param_value; \
    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
          (FIRST_ARG, SECOND_ARG, sizeof(param_value), &param_value, 0)); \
    if (param_value) \
      return py::object(handle_from_new_ptr( \
            new TYPE(param_value, /*retain*/ true))); \
    else \
      return py::none(); \
  }

#define PYOPENCL_GET_VEC_INFO(WHAT, FIRST_ARG, SECOND_ARG, RES_VEC) \
  { \
    size_t size; \
    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
        (FIRST_ARG, SECOND_ARG, 0, 0, &size)); \
    \
    RES_VEC.resize(size / sizeof(RES_VEC.front())); \
    \
    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
        (FIRST_ARG, SECOND_ARG, size, \
         RES_VEC.empty( ) ? nullptr : &RES_VEC.front(), &size)); \
  }

#define PYOPENCL_GET_STR_INFO(WHAT, FIRST_ARG, SECOND_ARG) \
  { \
    size_t param_value_size; \
    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
        (FIRST_ARG, SECOND_ARG, 0, 0, &param_value_size)); \
    \
    std::vector<char> param_value(param_value_size); \
    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
        (FIRST_ARG, SECOND_ARG, param_value_size,  \
         param_value.empty( ) ? nullptr : &param_value.front(), &param_value_size)); \
    \
    return py::cast( \
        param_value.empty( ) ? "" : std::string(&param_value.front(), param_value_size-1)); \
  }




#define PYOPENCL_GET_TYPED_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE) \
  { \
    TYPE param_value; \
    PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
        (FIRST_ARG, SECOND_ARG, sizeof(param_value), &param_value, 0)); \
    return py::cast(param_value); \
  }

// }}}


// {{{ event helpers --------------------------------------------------------------
#define PYOPENCL_PARSE_WAIT_FOR \
    cl_uint num_events_in_wait_list = 0; \
    std::vector<cl_event> event_wait_list; \
    \
    if (py_wait_for.ptr() != Py_None) \
    { \
      for (py::handle evt: py_wait_for) \
      { \
        event_wait_list.push_back(py::cast<const event &>(evt).data()); \
        ++num_events_in_wait_list; \
      } \
    }

#define PYOPENCL_WAITLIST_ARGS \
    num_events_in_wait_list, (num_events_in_wait_list == 0) ? nullptr : &event_wait_list.front()

#define PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, obj) \
    try \
    { \
      return new nanny_event(evt, false, obj); \
    } \
    catch (...) \
    { \
      clReleaseEvent(evt); \
      throw; \
    }

#define PYOPENCL_RETURN_NEW_EVENT(evt) \
    try \
    { \
      return new event(evt, false); \
    } \
    catch (...) \
    { \
      clReleaseEvent(evt); \
      throw; \
    }

// }}}


// {{{ equality testing

#define PYOPENCL_EQUALITY_TESTS(cls) \
    bool operator==(cls const &other) const \
    { return data() == other.data(); } \
    bool operator!=(cls const &other) const \
    { return data() != other.data(); } \
    long hash() const \
    { return (long) (intptr_t) data(); }

// }}}


namespace pyopencl
{
  class program;
  class command_queue;

  // {{{ error

  class error : public std::runtime_error
  {
    private:
      std::string m_routine;
      cl_int m_code;

      // This is here because clLinkProgram returns a program
      // object *just* so that there is somewhere for it to
      // stuff the linker logs. :/
      bool m_program_initialized;
      cl_program m_program;

    public:
      error(std::string const &routine, cl_int c, std::string const &msg="")
        : std::runtime_error(msg), m_routine(routine), m_code(c),
        m_program_initialized(false), m_program(nullptr)
      { }

      error(const char *routine, cl_program prg, cl_int c,
          const char *msg="")
        : std::runtime_error(msg), m_routine(routine), m_code(c),
        m_program_initialized(true), m_program(prg)
      { }

      virtual ~error()
      {
        if (m_program_initialized)
          clReleaseProgram(m_program);
      }

      const std::string &routine() const
      {
        return m_routine;
      }

      cl_int code() const
      {
        return m_code;
      }

      bool is_out_of_memory() const
      {
        return (code() == CL_MEM_OBJECT_ALLOCATION_FAILURE
            || code() == CL_OUT_OF_RESOURCES
            || code() == CL_OUT_OF_HOST_MEMORY);
      }

      program *get_program() const;

      // FIXME: Inheritance from builtin_exception confuses nanobind
      const char *err_what()
      {
        return what();
      }

      void set_error() const
      {
        py::object err_obj = py::cast(*this);
        py::object errors_mod = py::module_::import_("pyopencl._errors");

        if (code() == CL_MEM_OBJECT_ALLOCATION_FAILURE)
          PyErr_SetObject(errors_mod.attr("MemoryError").ptr(), err_obj.ptr());
        else if (code() <= CL_INVALID_VALUE)
          PyErr_SetObject(errors_mod.attr("LogicError").ptr(), err_obj.ptr());
        else if (code() > CL_INVALID_VALUE && code() < CL_SUCCESS)
          PyErr_SetObject(errors_mod.attr("RuntimeError").ptr(), err_obj.ptr());
        else
          PyErr_SetObject(errors_mod.attr("Error").ptr(), err_obj.ptr());
      }

  };

  // }}}


  // {{{ utility functions

  inline bool is_queue_out_of_order(cl_command_queue queue)
  {
      cl_command_queue_properties param_value;
      PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo,
          (queue, CL_QUEUE_PROPERTIES, sizeof(param_value), &param_value, 0));
      return param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  }

  // }}}


  // {{{ buffer interface helper

  class py_buffer_wrapper : public noncopyable
  {
    private:
      bool m_initialized;

      public:
        Py_buffer m_buf;

    py_buffer_wrapper()
      : m_initialized(false)
    {}

    void get(PyObject *obj, int flags)
    {
#ifdef PYPY_VERSION
      // work around https://bitbucket.org/pypy/pypy/issues/2873
      if (flags & PyBUF_ANY_CONTIGUOUS)
      {
        int flags_wo_cont = flags & ~PyBUF_ANY_CONTIGUOUS;
        if (PyObject_GetBuffer(obj, &m_buf, flags_wo_cont | PyBUF_C_CONTIGUOUS))
        {
          PyErr_Clear();
          if (PyObject_GetBuffer(obj, &m_buf, flags_wo_cont | PyBUF_F_CONTIGUOUS))
            throw py::python_error();
        }
      }
      else
#endif
      if (PyObject_GetBuffer(obj, &m_buf, flags))
        throw py::python_error();

      m_initialized = true;
    }

    virtual ~py_buffer_wrapper()
    {
      if (m_initialized)
        PyBuffer_Release(&m_buf);
    }
  };


  // }}}

  inline
  py::tuple get_cl_header_version()
  {
    return py::make_tuple(
        PYOPENCL_CL_VERSION >> (3*4),
        (PYOPENCL_CL_VERSION >> (1*4)) & 0xff
        );
  }


  // {{{ platform

  class platform : noncopyable
  {
    private:
      cl_platform_id m_platform;

    public:
      platform(cl_platform_id pid)
      : m_platform(pid)
      { }

      platform(cl_platform_id pid, bool /*retain (ignored)*/)
      : m_platform(pid)
      { }

      cl_platform_id data() const
      {
        return m_platform;
      }

      PYOPENCL_EQUALITY_TESTS(platform);

      py::object get_info(cl_platform_info param_name) const
      {
        switch (param_name)
        {
          case CL_PLATFORM_PROFILE:
          case CL_PLATFORM_VERSION:
          case CL_PLATFORM_NAME:
          case CL_PLATFORM_VENDOR:
#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
          case CL_PLATFORM_EXTENSIONS:
#endif
            PYOPENCL_GET_STR_INFO(Platform, m_platform, param_name);

#if PYOPENCL_CL_VERSION >= 0x2010
          case CL_PLATFORM_HOST_TIMER_RESOLUTION:
            PYOPENCL_GET_TYPED_INFO(Platform, m_platform, param_name, cl_ulong);
#endif
#if PYOPENCL_CL_VERSION >= 0x3000
          case CL_PLATFORM_NUMERIC_VERSION:
            PYOPENCL_GET_TYPED_INFO(Platform, m_platform, param_name, cl_version);
          case CL_PLATFORM_EXTENSIONS_WITH_VERSION:
            {
              std::vector<cl_name_version> result;
              PYOPENCL_GET_VEC_INFO(Platform, m_platform, param_name, result);
              PYOPENCL_RETURN_VECTOR(cl_name_version, result);
            }
#endif
          default:
            throw error("Platform.get_info", CL_INVALID_VALUE);
        }
      }

      py::list get_devices(cl_device_type devtype);
  };




  inline
  py::list get_platforms()
  {
    cl_uint num_platforms = 0;
    PYOPENCL_CALL_GUARDED(clGetPlatformIDs, (0, 0, &num_platforms));

    std::vector<cl_platform_id> platforms(num_platforms);
    PYOPENCL_CALL_GUARDED(clGetPlatformIDs,
        (num_platforms, platforms.empty( ) ? nullptr : &platforms.front(), &num_platforms));

    py::list result;
    for (cl_platform_id pid: platforms)
      result.append(handle_from_new_ptr(
            new platform(pid)));

    return result;
  }

  // }}}


  // {{{ device

  class device : noncopyable
  {
    public:
      enum reference_type_t {
        REF_NOT_OWNABLE,
#if PYOPENCL_CL_VERSION >= 0x1020
        REF_CL_1_2,
#endif
      };
    private:
      cl_device_id m_device;
      reference_type_t m_ref_type;

    public:
      device(cl_device_id did)
      : m_device(did), m_ref_type(REF_NOT_OWNABLE)
      { }

      device(cl_device_id did, bool retain, reference_type_t ref_type=REF_NOT_OWNABLE)
      : m_device(did), m_ref_type(ref_type)
      {
        if (retain && ref_type != REF_NOT_OWNABLE)
        {
          if (false)
          { }

#if PYOPENCL_CL_VERSION >= 0x1020
          else if (ref_type == REF_CL_1_2)
          {
            PYOPENCL_CALL_GUARDED(clRetainDevice, (did));
          }
#endif

          else
            throw error("Device", CL_INVALID_VALUE,
                "cannot own references to devices when device fission or CL 1.2 is not available");
        }
      }

      ~device()
      {
#if PYOPENCL_CL_VERSION >= 0x1020
        if (m_ref_type == REF_CL_1_2)
          PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseDevice, (m_device));
#endif
      }

      cl_device_id data() const
      {
        return m_device;
      }

      PYOPENCL_EQUALITY_TESTS(device);

      py::object get_info(cl_device_info param_name) const
      {
#define DEV_GET_INT_INF(TYPE) \
        PYOPENCL_GET_TYPED_INFO(Device, m_device, param_name, TYPE);

        switch (param_name)
        {
          case CL_DEVICE_TYPE: DEV_GET_INT_INF(cl_device_type);
          case CL_DEVICE_VENDOR_ID: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_COMPUTE_UNITS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_WORK_GROUP_SIZE: DEV_GET_INT_INF(size_t);

          case CL_DEVICE_MAX_WORK_ITEM_SIZES:
            {
              std::vector<size_t> result;
              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
              PYOPENCL_RETURN_VECTOR(size_t, result);
            }

          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: DEV_GET_INT_INF(cl_uint);

          case CL_DEVICE_MAX_CLOCK_FREQUENCY: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_ADDRESS_BITS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_READ_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_WRITE_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_MEM_ALLOC_SIZE: DEV_GET_INT_INF(cl_ulong);
          case CL_DEVICE_IMAGE2D_MAX_WIDTH: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_IMAGE2D_MAX_HEIGHT: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_IMAGE3D_MAX_WIDTH: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_IMAGE3D_MAX_HEIGHT: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_IMAGE3D_MAX_DEPTH: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_IMAGE_SUPPORT: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_MAX_PARAMETER_SIZE: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_MAX_SAMPLERS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MEM_BASE_ADDR_ALIGN: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_SINGLE_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
          case CL_DEVICE_DOUBLE_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
#endif
#ifdef CL_DEVICE_HALF_FP_CONFIG
          case CL_DEVICE_HALF_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config);
#endif

          case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: DEV_GET_INT_INF(cl_device_mem_cache_type);
          case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: DEV_GET_INT_INF(cl_ulong);
          case CL_DEVICE_GLOBAL_MEM_SIZE: DEV_GET_INT_INF(cl_ulong);

          case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: DEV_GET_INT_INF(cl_ulong);
          case CL_DEVICE_MAX_CONSTANT_ARGS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_LOCAL_MEM_TYPE: DEV_GET_INT_INF(cl_device_local_mem_type);
          case CL_DEVICE_LOCAL_MEM_SIZE: DEV_GET_INT_INF(cl_ulong);
          case CL_DEVICE_ERROR_CORRECTION_SUPPORT: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_PROFILING_TIMER_RESOLUTION: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_ENDIAN_LITTLE: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_AVAILABLE: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_COMPILER_AVAILABLE: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_EXECUTION_CAPABILITIES: DEV_GET_INT_INF(cl_device_exec_capabilities);
#if PYOPENCL_CL_VERSION >= 0x2000
          case CL_DEVICE_QUEUE_ON_HOST_PROPERTIES: DEV_GET_INT_INF(cl_command_queue_properties);
#else
          case CL_DEVICE_QUEUE_PROPERTIES: DEV_GET_INT_INF(cl_command_queue_properties);
#endif

          case CL_DEVICE_NAME:
          case CL_DEVICE_VENDOR:
          case CL_DRIVER_VERSION:
          case CL_DEVICE_PROFILE:
          case CL_DEVICE_VERSION:
          case CL_DEVICE_EXTENSIONS:
            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);

          case CL_DEVICE_PLATFORM:
            PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_platform_id, platform);

#if PYOPENCL_CL_VERSION >= 0x1010
          case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: DEV_GET_INT_INF(cl_uint);

          case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF: DEV_GET_INT_INF(cl_uint);

          case CL_DEVICE_HOST_UNIFIED_MEMORY: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_OPENCL_C_VERSION:
            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
#endif
#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
          case CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV:
          case CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV:
          case CL_DEVICE_REGISTERS_PER_BLOCK_NV:
          case CL_DEVICE_WARP_SIZE_NV:
            DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_GPU_OVERLAP_NV:
          case CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:
          case CL_DEVICE_INTEGRATED_MEMORY_NV:
            DEV_GET_INT_INF(cl_bool);
#endif
#ifdef CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV
          case CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV:
            DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_PCI_BUS_ID_NV
          case CL_DEVICE_PCI_BUS_ID_NV:
            DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_PCI_SLOT_ID_NV
          case CL_DEVICE_PCI_SLOT_ID_NV:
            DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_PCI_DOMAIN_ID_NV
          case CL_DEVICE_PCI_DOMAIN_ID_NV:
            DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD
          case CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD: DEV_GET_INT_INF(cl_bool);
#endif
#ifdef CL_DEVICE_GFXIP_MAJOR_AMD
          case CL_DEVICE_GFXIP_MAJOR_AMD: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_GFXIP_MINOR_AMD
          case CL_DEVICE_GFXIP_MINOR_AMD: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD
          case CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD: DEV_GET_INT_INF(cl_uint);
#endif
#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_DEVICE_LINKER_AVAILABLE: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_BUILT_IN_KERNELS:
            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
          case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_PARENT_DEVICE:
            PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_device_id, device);
          case CL_DEVICE_PARTITION_MAX_SUB_DEVICES: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PARTITION_TYPE:
          case CL_DEVICE_PARTITION_PROPERTIES:
            {
              std::vector<cl_device_partition_property> result;
              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
              PYOPENCL_RETURN_VECTOR(cl_device_partition_property, result);
            }
          case CL_DEVICE_PARTITION_AFFINITY_DOMAIN:
            {
#if defined(__GNUG__) && !defined(__clang__)
#pragma GCC diagnostic push
// what's being ignored here is an alignment attribute to native size, which
// shouldn't matter on the relevant ABIs that I'm aware of.
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
              std::vector<cl_device_affinity_domain> result;
#if defined(__GNUG__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
              PYOPENCL_RETURN_VECTOR(cl_device_affinity_domain, result);
            }
          case CL_DEVICE_REFERENCE_COUNT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_PRINTF_BUFFER_SIZE: DEV_GET_INT_INF(cl_bool);
#endif
// {{{ AMD dev attrs cl_amd_device_attribute_query
//
// types of AMD dev attrs divined from
// https://github.com/KhronosGroup/OpenCL-CLHPP/blob/3b03738fef487378b188d21cc5f2bae276aa8721/include/CL/opencl.hpp#L1471-L1500
#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
          case CL_DEVICE_PROFILING_TIMER_OFFSET_AMD: DEV_GET_INT_INF(cl_ulong);
#endif
#ifdef CL_DEVICE_TOPOLOGY_AMD
          case CL_DEVICE_TOPOLOGY_AMD:
            PYOPENCL_GET_TYPED_INFO(
                Device, m_device, param_name, cl_device_topology_amd);
#endif
#ifdef CL_DEVICE_BOARD_NAME_AMD
          case CL_DEVICE_BOARD_NAME_AMD: ;
            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
#endif
#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
          case CL_DEVICE_GLOBAL_FREE_MEMORY_AMD:
            {
              std::vector<size_t> result;
              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
              PYOPENCL_RETURN_VECTOR(size_t, result);
            }
#endif
#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
          case CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
          case CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
          case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
          case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
          case CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
          case CL_DEVICE_LOCAL_MEM_BANKS_AMD: DEV_GET_INT_INF(cl_uint);
#endif
// FIXME: MISSING:
//
// CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD
// CL_DEVICE_GFXIP_MAJOR_AMD
// CL_DEVICE_GFXIP_MINOR_AMD
// CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD
// CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD
// CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD
// CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD
// CL_DEVICE_PCIE_ID_AMD

// }}}

#ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT
          case CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT: DEV_GET_INT_INF(cl_uint);
#endif
#if PYOPENCL_CL_VERSION >= 0x2000
          case CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES: DEV_GET_INT_INF(cl_command_queue_properties);
          case CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_ON_DEVICE_QUEUES: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_MAX_ON_DEVICE_EVENTS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_SVM_CAPABILITIES: DEV_GET_INT_INF(cl_device_svm_capabilities);
          case CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_MAX_PIPE_ARGS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PIPE_MAX_PACKET_SIZE: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT: DEV_GET_INT_INF(cl_uint);
#endif
#if PYOPENCL_CL_VERSION >= 0x2010
          case CL_DEVICE_IL_VERSION:
            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
          case CL_DEVICE_MAX_NUM_SUB_GROUPS: DEV_GET_INT_INF(cl_uint);
          case CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: DEV_GET_INT_INF(cl_bool);
#endif
#if PYOPENCL_CL_VERSION >= 0x3000
          case CL_DEVICE_NUMERIC_VERSION: DEV_GET_INT_INF(cl_version);
          case CL_DEVICE_EXTENSIONS_WITH_VERSION:
          case CL_DEVICE_ILS_WITH_VERSION:
          case CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION:
          case CL_DEVICE_OPENCL_C_ALL_VERSIONS:
          case CL_DEVICE_OPENCL_C_FEATURES:
            {
              std::vector<cl_name_version> result;
              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
              PYOPENCL_RETURN_VECTOR(cl_name_version, result);
            }
          case CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES: DEV_GET_INT_INF(cl_device_atomic_capabilities);
          case CL_DEVICE_ATOMIC_FENCE_CAPABILITIES: DEV_GET_INT_INF(cl_device_atomic_capabilities);
          case CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: DEV_GET_INT_INF(size_t);
          case CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT: DEV_GET_INT_INF(cl_bool);
          case CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT: DEV_GET_INT_INF(cl_bool);

#ifdef CL_DEVICE_DEVICE_ENQUEUE_SUPPORT
          case CL_DEVICE_DEVICE_ENQUEUE_SUPPORT: DEV_GET_INT_INF(cl_bool);
#endif
#ifdef CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES
          case CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES: DEV_GET_INT_INF(cl_device_device_enqueue_capabilities);
#endif

          case CL_DEVICE_PIPE_SUPPORT: DEV_GET_INT_INF(cl_bool);
#endif

#ifdef CL_DEVICE_ME_VERSION_INTEL
          case CL_DEVICE_ME_VERSION_INTEL: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM
          case CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_PAGE_SIZE_QCOM
          case CL_DEVICE_PAGE_SIZE_QCOM: DEV_GET_INT_INF(cl_uint);
#endif
#ifdef CL_DEVICE_SPIR_VERSIONS
          case CL_DEVICE_SPIR_VERSIONS:
            PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
#endif
#ifdef CL_DEVICE_CORE_TEMPERATURE_ALTERA
          case CL_DEVICE_CORE_TEMPERATURE_ALTERA: DEV_GET_INT_INF(cl_int);
#endif

#ifdef CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL
          case CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL:
            {
              std::vector<cl_uint> result;
              PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result);
              PYOPENCL_RETURN_VECTOR(cl_uint, result);
            }
#endif
#ifdef CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL
          case CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL: DEV_GET_INT_INF(cl_uint);
#endif

          default:
            throw error("Device.get_info", CL_INVALID_VALUE);
        }
      }

#if PYOPENCL_CL_VERSION >= 0x1020
      py::list create_sub_devices(py::object py_properties)
      {
        std::vector<cl_device_partition_property> properties;

        COPY_PY_LIST(cl_device_partition_property, properties);
        properties.push_back(0);

        cl_device_partition_property *props_ptr
          = properties.empty( ) ? nullptr : &properties.front();

        cl_uint num_entries;
        PYOPENCL_CALL_GUARDED(clCreateSubDevices,
            (m_device, props_ptr, 0, nullptr, &num_entries));

        std::vector<cl_device_id> result;
        result.resize(num_entries);

        PYOPENCL_CALL_GUARDED(clCreateSubDevices,
            (m_device, props_ptr, num_entries, &result.front(), nullptr));

        py::list py_result;
        for (cl_device_id did: result)
          py_result.append(handle_from_new_ptr(
                new pyopencl::device(did, /*retain*/true,
                  device::REF_CL_1_2)));
        return py_result;
      }
#endif

#if PYOPENCL_CL_VERSION >= 0x2010
      py::tuple device_and_host_timer() const
      {
        cl_ulong device_timestamp, host_timestamp;
        PYOPENCL_CALL_GUARDED(clGetDeviceAndHostTimer,
            (m_device, &device_timestamp, &host_timestamp));
        return py::make_tuple(device_timestamp, host_timestamp);
      }

      cl_ulong host_timer() const
      {
        cl_ulong host_timestamp;
        PYOPENCL_CALL_GUARDED(clGetHostTimer,
            (m_device, &host_timestamp));
        return host_timestamp;
      }
#endif
  };




  inline py::list platform::get_devices(cl_device_type devtype)
  {
    cl_uint num_devices = 0;
    PYOPENCL_PRINT_CALL_TRACE("clGetDeviceIDs");
    {
      cl_int status_code;
      status_code = clGetDeviceIDs(m_platform, devtype, 0, 0, &num_devices);
      if (status_code == CL_DEVICE_NOT_FOUND)
        num_devices = 0;
      else if (status_code != CL_SUCCESS) \
        throw pyopencl::error("clGetDeviceIDs", status_code);
    }

    if (num_devices == 0)
      return py::list();

    std::vector<cl_device_id> devices(num_devices);
    PYOPENCL_CALL_GUARDED(clGetDeviceIDs,
        (m_platform, devtype,
         num_devices, devices.empty( ) ? nullptr : &devices.front(), &num_devices));

    py::list result;
    for (cl_device_id did: devices)
      result.append(handle_from_new_ptr(
            new device(did)));

    return result;
  }

  // }}}


  // {{{ context

  class context : public noncopyable
  {
    private:
      cl_context m_context;

    public:
      context(cl_context ctx, bool retain)
        : m_context(ctx)
      {
        if (retain)
          PYOPENCL_CALL_GUARDED(clRetainContext, (ctx));
      }

      ~context()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseContext,
            (m_context));
      }

      cl_context data() const
      {
        return m_context;
      }

      PYOPENCL_EQUALITY_TESTS(context);

      py::object get_info(cl_context_info param_name) const
      {
        switch (param_name)
        {
          case CL_CONTEXT_REFERENCE_COUNT:
            PYOPENCL_GET_TYPED_INFO(
                Context, m_context, param_name, cl_uint);

          case CL_CONTEXT_DEVICES:
            {
              std::vector<cl_device_id> result;
              PYOPENCL_GET_VEC_INFO(Context, m_context, param_name, result);

              py::list py_result;
              for (cl_device_id did: result)
                py_result.append(handle_from_new_ptr(
                      new pyopencl::device(did)));
              return py_result;
            }

          case CL_CONTEXT_PROPERTIES:
            {
              std::vector<cl_context_properties> result;
              PYOPENCL_GET_VEC_INFO(Context, m_context, param_name, result);

              py::list py_result;
              for (size_t i = 0; i < result.size(); i+=2)
              {
                cl_context_properties key = result[i];
                py::object value;
                switch (key)
                {
                  case CL_CONTEXT_PLATFORM:
                    {
                      value = py::object(
                          handle_from_new_ptr(new platform(
                            reinterpret_cast<cl_platform_id>(result[i+1]))));
                      break;
                    }

#if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1)
#if defined(__APPLE__) && defined(HAVE_GL)
                  case CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE:
#else
                  case CL_GL_CONTEXT_KHR:
                  case CL_EGL_DISPLAY_KHR:
                  case CL_GLX_DISPLAY_KHR:
                  case CL_WGL_HDC_KHR:
                  case CL_CGL_SHAREGROUP_KHR:
#endif
                    value = py::cast(result[i+1]);
                    break;

#endif
                  case 0:
                    break;

                  default:
                    throw error("Context.get_info", CL_INVALID_VALUE,
                        "unknown context_property key encountered");
                }

                py_result.append(py::make_tuple(result[i], value));
              }
              return py_result;
            }

#if PYOPENCL_CL_VERSION >= 0x1010
          case CL_CONTEXT_NUM_DEVICES:
            PYOPENCL_GET_TYPED_INFO(
                Context, m_context, param_name, cl_uint);
#endif

          default:
            throw error("Context.get_info", CL_INVALID_VALUE);
        }
      }


      // not exposed to python
      int get_hex_platform_version() const
      {
        std::vector<cl_device_id> devices;
        PYOPENCL_GET_VEC_INFO(Context, m_context, CL_CONTEXT_DEVICES, devices);

        if (devices.size() == 0)
          throw error("Context._get_hex_version", CL_INVALID_VALUE,
              "platform has no devices");

        cl_platform_id plat;

        PYOPENCL_CALL_GUARDED(clGetDeviceInfo,
            (devices[0], CL_DEVICE_PLATFORM, sizeof(plat), &plat, nullptr));

        std::string plat_version;
        {
          size_t param_value_size;
          PYOPENCL_CALL_GUARDED(clGetPlatformInfo,
              (plat, CL_PLATFORM_VERSION, 0, 0, &param_value_size));

          std::vector<char> param_value(param_value_size);
          PYOPENCL_CALL_GUARDED(clGetPlatformInfo,
              (plat, CL_PLATFORM_VERSION, param_value_size,
               param_value.empty( ) ? nullptr : &param_value.front(), &param_value_size));

          plat_version =
              param_value.empty( ) ? "" : std::string(&param_value.front(), param_value_size-1);
        }

        int major_ver, minor_ver;
        errno = 0;
        int match_count = sscanf(plat_version.c_str(), "OpenCL %d.%d ", &major_ver, &minor_ver);
        if (errno || match_count != 2)
          throw error("Context._get_hex_platform_version", CL_INVALID_VALUE,
              "Platform version string did not have expected format");

        return major_ver << 12 | minor_ver << 4;
      }

#if PYOPENCL_CL_VERSION >= 0x2010
      void set_default_device_command_queue(device const &dev, command_queue const &queue);
#endif
  };


  inline
  std::vector<cl_context_properties> parse_context_properties(
      py::object py_properties)
  {
    std::vector<cl_context_properties> props;

    if (py_properties.ptr() != Py_None)
    {
      for (py::handle prop_tuple_py: py_properties)
      {
        py::tuple prop_tuple(py::cast<py::tuple>(prop_tuple_py));

        if (len(prop_tuple) != 2)
          throw error("Context", CL_INVALID_VALUE, "property tuple must have length 2");
        cl_context_properties prop = py::cast<cl_context_properties>(prop_tuple[0]);
        props.push_back(prop);

        if (prop == CL_CONTEXT_PLATFORM)
        {
          props.push_back(
              reinterpret_cast<cl_context_properties>(
                py::cast<const platform &>(prop_tuple[1]).data()));
        }
#if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1)
#if defined(_WIN32)
       else if (prop == CL_WGL_HDC_KHR)
       {
         // size_t is a stand-in for HANDLE, hopefully has the same size.
         size_t hnd = py::cast<size_t>(prop_tuple[1]);
         props.push_back(hnd);
       }
#endif
       else if (
#if defined(__APPLE__) && defined(HAVE_GL)
            prop == CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE
#else
            prop == CL_GL_CONTEXT_KHR
            || prop == CL_EGL_DISPLAY_KHR
            || prop == CL_GLX_DISPLAY_KHR
            || prop == CL_CGL_SHAREGROUP_KHR
#endif
           )
       {
          py::object ctypes = py::module_::import_("ctypes");
          py::object prop = prop_tuple[1], c_void_p = ctypes.attr("c_void_p");
          py::object ptr = ctypes.attr("cast")(prop, c_void_p);
          props.push_back(py::cast<cl_context_properties>(ptr.attr("value")));
       }
#endif
        else
          throw error("Context", CL_INVALID_VALUE, "invalid context property");
      }
      props.push_back(0);
    }

    return props;
  }


  inline
  void create_context_inner(context *self, py::object py_devices, py::object py_properties,
      py::object py_dev_type)
  {
    std::vector<cl_context_properties> props
      = parse_context_properties(py_properties);

    cl_context_properties *props_ptr
      = props.empty( ) ? nullptr : &props.front();

    cl_int status_code;

    cl_context ctx;

    // from device list
    if (py_devices.ptr() != Py_None)
    {
      if (py_dev_type.ptr() != Py_None)
        throw error("Context", CL_INVALID_VALUE,
            "one of 'devices' or 'dev_type' must be None");

      std::vector<cl_device_id> devices;
      for (py::handle py_dev: py_devices)
        devices.push_back(py::cast<const device &>(py_dev).data());

      PYOPENCL_PRINT_CALL_TRACE("clCreateContext");
      ctx = clCreateContext(
          props_ptr,
          devices.size(),
          devices.empty( ) ? nullptr : &devices.front(),
          0, 0, &status_code);
    }
    // from dev_type
    else
    {
      cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT;
      if (py_dev_type.ptr() != Py_None)
        dev_type = py::cast<cl_device_type>(py_dev_type);

      PYOPENCL_PRINT_CALL_TRACE("clCreateContextFromType");
      ctx = clCreateContextFromType(props_ptr, dev_type, 0, 0, &status_code);
    }

    if (status_code != CL_SUCCESS)
      throw pyopencl::error("Context", status_code);

    try
    {
      new (self) context(ctx, false);
    }
    catch (...)
    {
      PYOPENCL_CALL_GUARDED(clReleaseContext, (ctx));
      throw;
    }
  }


  inline
  void create_context(context *self, py::object py_devices, py::object py_properties,
      py::object py_dev_type)
  {
    PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
      create_context_inner(self, py_devices, py_properties, py_dev_type);
    )
  }

  // }}}


  // {{{ command_queue

  class command_queue
  {
    private:
      cl_command_queue m_queue;
      // m_finalized==True indicates that this command queue should no longer
      // be used. An example of this is if a command queue is used as a context
      // manager, after the 'with' block exits.
      //
      // This mechanism is not foolproof, as it is perfectly possible to create
      // other Python proxy objects referring to the same underlying
      // cl_command_queue. Even so, this ought to flag a class of potentially
      // very damaging synchronization bugs.
      bool m_finalized;

    public:
      command_queue(cl_command_queue q, bool retain)
        : m_queue(q), m_finalized(false)
      {
        if (retain)
          PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (q));
      }

      command_queue(command_queue const &src)
        : m_queue(src.m_queue), m_finalized(false)
      {
        PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
      }

      command_queue(
          const context &ctx,
          const device *py_dev=nullptr,
          py::object py_props=py::none())
      : m_finalized(false)
      {
        cl_device_id dev;
        if (py_dev)
          dev = py_dev->data();
        else
        {
          std::vector<cl_device_id> devs;
          PYOPENCL_GET_VEC_INFO(Context, ctx.data(), CL_CONTEXT_DEVICES, devs);
          if (devs.size() == 0)
            throw pyopencl::error("CommandQueue", CL_INVALID_VALUE,
                "context doesn't have any devices? -- don't know which one to default to");
          dev = devs[0];
        }

        int hex_plat_version = ctx.get_hex_platform_version();

        bool props_given_as_numeric;
        cl_command_queue_properties num_props;
        if (py_props.is_none())
        {
          num_props = 0;
          props_given_as_numeric = true;
        }
        else
        {
          try
          {
            num_props = py::cast<cl_command_queue_properties>(py_props);
            props_given_as_numeric = true;
          }
          catch (py::cast_error &)
          {
            props_given_as_numeric = false;
          }
        }

        if (props_given_as_numeric)
        {
#if PYOPENCL_CL_VERSION >= 0x2000
          if (hex_plat_version  >= 0x2000)
          {
            cl_queue_properties props_list[] = { CL_QUEUE_PROPERTIES, num_props, 0 };

            cl_int status_code;

            PYOPENCL_PRINT_CALL_TRACE("clCreateCommandQueueWithProperties");
            m_queue = clCreateCommandQueueWithProperties(
                ctx.data(), dev, props_list, &status_code);

            if (status_code != CL_SUCCESS)
              throw pyopencl::error("CommandQueue", status_code);
          }
          else
#endif
          {
            cl_int status_code;

            PYOPENCL_PRINT_CALL_TRACE("clCreateCommandQueue");
#if defined(__GNUG__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
            m_queue = clCreateCommandQueue(
                ctx.data(), dev, num_props, &status_code);
#if defined(__GNUG__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
            if (status_code != CL_SUCCESS)
              throw pyopencl::error("CommandQueue", status_code);
          }
        }
        else
        {
#if PYOPENCL_CL_VERSION < 0x2000
            throw error("CommandQueue", CL_INVALID_VALUE,
                "queue properties given as an iterable, "
                "which is only allowed when PyOpenCL was built "
                "against an OpenCL 2+ header");
#else
          if (hex_plat_version  < 0x2000)
          {
            std::cerr <<
                "queue properties given as an iterable, "
                "which uses an OpenCL 2+-only interface, "
                "but the context's platform does not "
                "declare OpenCL 2 support. Proceeding "
                "as requested, but the next thing you see "
                "may be a crash." << std:: endl;
          }

          PYOPENCL_STACK_CONTAINER(cl_queue_properties, props, py::len(py_props) + 1);
          {
            size_t i = 0;
            for (auto prop: py_props)
              props[i++] = py::cast<cl_queue_properties>(prop);
            props[i++] = 0;
          }

          cl_int status_code;
          PYOPENCL_PRINT_CALL_TRACE("clCreateCommandQueueWithProperties");
          m_queue = clCreateCommandQueueWithProperties(
              ctx.data(), dev, PYOPENCL_STACK_CONTAINER_GET_PTR(props), &status_code);

          if (status_code != CL_SUCCESS)
            throw pyopencl::error("CommandQueue", status_code);
#endif
        }
      }

      ~command_queue()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue,
            (m_queue));
      }

      const cl_command_queue data() const
      {
        if (m_finalized)
        {
          auto mod_warnings(py::module_::import_("warnings"));
          auto mod_cl(py::module_::import_("pyopencl"));
          mod_warnings.attr("warn")(
              "Command queue used after exit of context manager. "
              "This is deprecated and will stop working in 2023.",
              mod_cl.attr("CommandQueueUsedAfterExit")
              );
        }
        return m_queue;
      }

      void finalize()
      {
        m_finalized = true;
      }

      PYOPENCL_EQUALITY_TESTS(command_queue);

      py::object get_info(cl_command_queue_info param_name) const
      {
        switch (param_name)
        {
          case CL_QUEUE_CONTEXT:
            PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name,
                cl_context, context);
          case CL_QUEUE_DEVICE:
            PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name,
                cl_device_id, device);
          case CL_QUEUE_REFERENCE_COUNT:
            PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name,
                cl_uint);
          case CL_QUEUE_PROPERTIES:
            PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name,
                cl_command_queue_properties);
#if PYOPENCL_CL_VERSION >= 0x2000
          case CL_QUEUE_SIZE:
            PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name,
                cl_uint);
#endif
#if PYOPENCL_CL_VERSION >= 0x2010
          case CL_QUEUE_DEVICE_DEFAULT:
            PYOPENCL_GET_OPAQUE_INFO(
                CommandQueue, m_queue, param_name, cl_command_queue, command_queue);
#endif
#if PYOPENCL_CL_VERSION >= 0x3000
          case CL_QUEUE_PROPERTIES_ARRAY:
            {
              std::vector<cl_queue_properties> result;
              PYOPENCL_GET_VEC_INFO(CommandQueue, data(), param_name, result);
              PYOPENCL_RETURN_VECTOR(cl_queue_properties, result);
            }
#endif

          default:
            throw error("CommandQueue.get_info", CL_INVALID_VALUE);
        }
      }

      std::unique_ptr<context> get_context() const
      {
        cl_context param_value;
        PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo,
            (data(), CL_QUEUE_CONTEXT, sizeof(param_value), &param_value, 0));
        return std::unique_ptr<context>(
            new context(param_value, /*retain*/ true));
      }

#if PYOPENCL_CL_VERSION < 0x1010
      cl_command_queue_properties set_property(
          cl_command_queue_properties prop,
          bool enable)
      {
        cl_command_queue_properties old_prop;
        PYOPENCL_CALL_GUARDED(clSetCommandQueueProperty,
            (data(), prop, PYOPENCL_CAST_BOOL(enable), &old_prop));
        return old_prop;
      }
#endif

      void flush()
      { PYOPENCL_CALL_GUARDED(clFlush, (data())); }
      void finish()
      {
        if (m_finalized) {
          return;
        } else {
          cl_command_queue queue = data();

          PYOPENCL_CALL_GUARDED_THREADED(clFinish, (queue));
        }
      }

      // not exposed to python
      int get_hex_device_version() const
      {
        cl_device_id dev;

        PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo,
            (data(), CL_QUEUE_DEVICE, sizeof(dev), &dev, nullptr));

        std::string dev_version;
        {
          size_t param_value_size;
          PYOPENCL_CALL_GUARDED(clGetDeviceInfo,
              (dev, CL_DEVICE_VERSION, 0, 0, &param_value_size));

          std::vector<char> param_value(param_value_size);
          PYOPENCL_CALL_GUARDED(clGetDeviceInfo,
              (dev, CL_DEVICE_VERSION, param_value_size,
               param_value.empty( ) ? nullptr : &param_value.front(), &param_value_size));

          dev_version =
              param_value.empty( ) ? "" : std::string(&param_value.front(), param_value_size-1);
        }

        int major_ver, minor_ver;
        errno = 0;
        int match_count = sscanf(dev_version.c_str(), "OpenCL %d.%d ", &major_ver, &minor_ver);
        if (errno || match_count != 2)
          throw error("CommandQueue._get_hex_device_version", CL_INVALID_VALUE,
              "Platform version string did not have expected format");

        return major_ver << 12 | minor_ver << 4;
      }
  };

  // }}}


  // {{{ command_queue_ref

  // In contrast to command_queue, command_queue_ref is "nullable", i.e.
  // it is a RAII *optional* reference to a command queue.

  class command_queue_ref
  {
    private:
      bool m_valid;
      cl_command_queue m_queue;

    public:
      command_queue_ref()
        : m_valid(false)
      {}

      command_queue_ref(cl_command_queue queue)
        : m_valid(queue != nullptr), m_queue(queue)
      {
        // E.g. SVM allocations of size zero use a NULL queue. Tolerate that.
        if (m_valid)
          PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
      }

      command_queue_ref(command_queue_ref &&src) noexcept
        : m_valid(src.m_valid), m_queue(src.m_queue)
      {
        src.m_valid = false;
      }

      command_queue_ref(const command_queue_ref &src)
      : m_valid(src.m_valid), m_queue(src.m_queue)
      {
        // Note that there isn't anything per se wrong with this
        // copy constructor, the refcounting is just potentially
        // expensive.
        //
        // All code in current use moves these, it does not copy them,
        // so this should never get called.
        //
        // Unfortunately, we can't delete this copy constructor,
        // because we would like to return these from functions.
        // This makes at least gcc require copy constructors, even
        // if those are never called due to NRVO.
        std::cerr << "COPYING A COMMAND_QUEUE_REF." << std::endl;

        if (m_valid)
          PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
      }

      command_queue_ref &operator=(const command_queue_ref &) = delete;

      ~command_queue_ref()
      {
        reset();
      }

      bool is_valid() const
      {
        return m_valid;
      }

      cl_command_queue data() const
      {
        if (m_valid)
          return m_queue;
        else
          throw error("command_queue_ref.data", CL_INVALID_VALUE,
              "command_queue_ref is not valid");
      }

      void reset()
      {
        if (m_valid)
          PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue, (m_queue));
        m_valid = false;
      }

      void set(cl_command_queue queue)
      {
        if (!queue)
          throw error("command_queue_ref.set", CL_INVALID_VALUE,
              "cannot set to NULL command queue");

        if (m_valid)
          PYOPENCL_CALL_GUARDED(clReleaseCommandQueue, (m_queue));
        m_queue = queue;
        PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
        m_valid = true;
      }
  };

  // }}}


  // {{{ event/synchronization

  class event : noncopyable
  {
    private:
      cl_event m_event;

    public:
      event(cl_event event, bool retain)
        : m_event(event)
      {
        if (retain)
          PYOPENCL_CALL_GUARDED(clRetainEvent, (event));
      }

      event(event const &src)
        : m_event(src.m_event)
      { PYOPENCL_CALL_GUARDED(clRetainEvent, (m_event)); }

      virtual ~event()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseEvent,
            (m_event));
      }

      const cl_event data() const
      { return m_event; }

      PYOPENCL_EQUALITY_TESTS(event);

      py::object get_info(cl_event_info param_name) const
      {
        switch (param_name)
        {
          case CL_EVENT_COMMAND_QUEUE:
            PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name,
                cl_command_queue, command_queue);
          case CL_EVENT_COMMAND_TYPE:
            PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name,
                cl_command_type);
          case CL_EVENT_COMMAND_EXECUTION_STATUS:
            PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name,
                cl_int);
          case CL_EVENT_REFERENCE_COUNT:
            PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name,
                cl_uint);
#if PYOPENCL_CL_VERSION >= 0x1010
          case CL_EVENT_CONTEXT:
            PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name,
                cl_context, context);
#endif

          default:
            throw error("Event.get_info", CL_INVALID_VALUE);
        }
      }

      py::object get_profiling_info(cl_profiling_info param_name) const
      {
        switch (param_name)
        {
          case CL_PROFILING_COMMAND_QUEUED:
          case CL_PROFILING_COMMAND_SUBMIT:
          case CL_PROFILING_COMMAND_START:
          case CL_PROFILING_COMMAND_END:
#if PYOPENCL_CL_VERSION >= 0x2000
          case CL_PROFILING_COMMAND_COMPLETE:
#endif
            PYOPENCL_GET_TYPED_INFO(EventProfiling, m_event, param_name,
                cl_ulong);
          default:
            throw error("Event.get_profiling_info", CL_INVALID_VALUE);
        }
      }

      virtual void wait()
      {
        PYOPENCL_CALL_GUARDED_THREADED(clWaitForEvents, (1, &m_event));
      }

      // Called from a destructor context below:
      // - Should not release the GIL
      // - Should fail gracefully in the face of errors
      virtual void wait_during_cleanup_without_releasing_the_gil()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clWaitForEvents, (1, &m_event));
      }

#if PYOPENCL_CL_VERSION >= 0x1010
    // {{{ set_callback, by way of a a thread-based construction

    private:
      struct event_callback_info_t
      {
        std::mutex m_mutex;
        std::condition_variable m_condvar;

        // FIXME: Should implement GC traversal so that these can be collected.
        py::object m_py_event;
        py::object m_py_callback;

        bool m_set_callback_suceeded;

        bool m_notify_thread_wakeup_is_genuine;

        cl_event m_event;
        cl_int m_command_exec_status;

        event_callback_info_t(py::object py_event, py::object py_callback)
        : m_py_event(py_event), m_py_callback(py_callback), m_set_callback_suceeded(true),
        m_notify_thread_wakeup_is_genuine(false)
        {}
      };

      static void CL_CALLBACK evt_callback(cl_event evt, cl_int command_exec_status, void *user_data)
      {
        event_callback_info_t *cb_info = reinterpret_cast<event_callback_info_t *>(user_data);
        {
          std::lock_guard<std::mutex> lg(cb_info->m_mutex);
          cb_info->m_event = evt;
          cb_info->m_command_exec_status = command_exec_status;
          cb_info->m_notify_thread_wakeup_is_genuine = true;
        }

        cb_info->m_condvar.notify_one();
      }

    public:
      void set_callback(cl_int command_exec_callback_type, py::object pfn_event_notify)
      {
        // The reason for doing this via a thread is that we're able to wait on
        // acquiring the GIL. (which we can't in the callback)

        std::unique_ptr<event_callback_info_t> cb_info_holder(
            new event_callback_info_t(
              handle_from_new_ptr(new event(*this)),
              pfn_event_notify));
        event_callback_info_t *cb_info = cb_info_holder.get();

        std::thread notif_thread([cb_info]()
            {
              {
                std::unique_lock<std::mutex> ulk(cb_info->m_mutex);
                cb_info->m_condvar.wait(
                    ulk,
                    [&](){ return cb_info->m_notify_thread_wakeup_is_genuine; });

                // ulk no longer held here, cb_info ready for deletion
              }

              {
                py::gil_scoped_acquire acquire;

                if (cb_info->m_set_callback_suceeded)
                {
                  try {
                    cb_info->m_py_callback(
                        // cb_info->m_py_event,
                        cb_info->m_command_exec_status);
                  }
                  catch (std::exception &exc)
                  {
                    std::cerr
                    << "[pyopencl] event callback handler threw an exception, ignoring: "
                    << exc.what()
                    << std::endl;
                  }
                }

                // Need to hold GIL to delete py::object instances in
                // event_callback_info_t
                delete cb_info;
              }
            });
        // Thread is away--it is now its responsibility to free cb_info.
        cb_info_holder.release();

        // notif_thread should no longer be coupled to the lifetime of the thread.
        notif_thread.detach();

        try
        {
          PYOPENCL_CALL_GUARDED(clSetEventCallback, (
                data(), command_exec_callback_type, &event::evt_callback, cb_info));
        }
        catch (...) {
          // Setting the callback did not succeed. The thread would never
          // be woken up. Wake it up to let it know that it can stop.
          {
            std::lock_guard<std::mutex> lg(cb_info->m_mutex);
            cb_info->m_set_callback_suceeded = false;
            cb_info->m_notify_thread_wakeup_is_genuine = true;
          }
          cb_info->m_condvar.notify_one();
          throw;
        }
      }
      // }}}
#endif
  };

  class nanny_event : public event
  {
    // In addition to everything an event does, the nanny event holds a reference
    // to a Python object and waits for its own completion upon destruction.

    protected:
      std::unique_ptr<py_buffer_wrapper> m_ward;

    public:

      nanny_event(cl_event evt, bool retain, std::unique_ptr<py_buffer_wrapper> &ward)
        : event(evt, retain), m_ward(std::move(ward))
      { }

      ~nanny_event()
      {
        // It appears that Pybind can get very confused if we release the GIL here:
        // https://github.com/inducer/pyopencl/issues/296
        wait_during_cleanup_without_releasing_the_gil();
      }

      py::object get_ward() const
      {
        if (m_ward.get())
        {
          return py::borrow<py::object>(m_ward->m_buf.obj);
        }
        else
          return py::none();
      }

      virtual void wait()
      {
        event::wait();
        m_ward.reset();
      }

      virtual void wait_during_cleanup_without_releasing_the_gil()
      {
        event::wait_during_cleanup_without_releasing_the_gil();
        m_ward.reset();
      }
  };




  inline
  void wait_for_events(py::object events)
  {
    cl_uint num_events_in_wait_list = 0;
    std::vector<cl_event> event_wait_list(len(events));

    for (py::handle evt: events)
      event_wait_list[num_events_in_wait_list++] = py::cast<event &>(evt).data();

    PYOPENCL_CALL_GUARDED_THREADED(clWaitForEvents, (
          PYOPENCL_WAITLIST_ARGS));
  }




#if PYOPENCL_CL_VERSION >= 0x1020
  inline
  event *enqueue_marker_with_wait_list(command_queue &cq,
      py::object py_wait_for)
  {
    PYOPENCL_PARSE_WAIT_FOR;
    cl_event evt;

    PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, (
          cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }

  inline
  event *enqueue_barrier_with_wait_list(command_queue &cq,
      py::object py_wait_for)
  {
    PYOPENCL_PARSE_WAIT_FOR;
    cl_event evt;

    PYOPENCL_CALL_GUARDED(clEnqueueBarrierWithWaitList,
        (cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }
#endif


  // {{{ used internally for pre-OpenCL-1.2 contexts

  inline
  event *enqueue_marker(command_queue &cq)
  {
    cl_event evt;

    PYOPENCL_CALL_GUARDED(clEnqueueMarker, (
          cq.data(), &evt));

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }

  inline
  void enqueue_wait_for_events(command_queue &cq, py::object py_events)
  {
    cl_uint num_events = 0;
    std::vector<cl_event> event_list(len(py_events));

    for (py::handle py_evt: py_events)
      event_list[num_events++] = py::cast<event &>(py_evt).data();

    PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (
          cq.data(), num_events, event_list.empty( ) ? nullptr : &event_list.front()));
  }

  inline
  void enqueue_barrier(command_queue &cq)
  {
    PYOPENCL_CALL_GUARDED(clEnqueueBarrier, (cq.data()));
  }

  // }}}


#if PYOPENCL_CL_VERSION >= 0x1010
  class user_event : public event
  {
    public:
      user_event(cl_event evt, bool retain)
        : event(evt, retain)
      { }

      void set_status(cl_int execution_status)
      {
        PYOPENCL_CALL_GUARDED(clSetUserEventStatus, (data(), execution_status));
      }
  };




  inline
  void create_user_event(user_event *self, context &ctx)
  {
    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateUserEvent");
    cl_event evt = clCreateUserEvent(ctx.data(), &status_code);

    if (status_code != CL_SUCCESS)
      throw pyopencl::error("UserEvent", status_code);

    try
    {
      new (self) user_event(evt, false);
    }
    catch (...)
    {
      clReleaseEvent(evt);
      throw;
    }
  }

#endif

  // }}}


  // {{{ memory_object

  py::object create_mem_object_wrapper(cl_mem mem, bool retain);

  class memory_object_holder
  {
    public:
      virtual const cl_mem data() const = 0;

      PYOPENCL_EQUALITY_TESTS(memory_object_holder);

      size_t size() const
      {
        size_t param_value;
        PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
            (data(), CL_MEM_SIZE, sizeof(param_value), &param_value, 0));
        return param_value;
      }

      py::object get_info(cl_mem_info param_name) const;

      virtual ~memory_object_holder()
      { }
  };




  class memory_object : noncopyable, public memory_object_holder
  {
    public:
      typedef std::unique_ptr<py_buffer_wrapper> hostbuf_t;

    private:
      bool m_valid;
      cl_mem m_mem;
      hostbuf_t m_hostbuf;

    public:
      memory_object(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
        : m_valid(true), m_mem(mem)
      {
        if (retain)
          PYOPENCL_CALL_GUARDED(clRetainMemObject, (mem));

        m_hostbuf = std::move(hostbuf);
      }

      memory_object(memory_object &src)
      : m_valid(true), m_mem(src.m_mem)
      {
        PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem));
      }

      memory_object(memory_object &&src)
      : m_valid(true), m_mem(src.m_mem), m_hostbuf(std::move(src.m_hostbuf))
      { }

      memory_object(memory_object_holder const &src)
        : m_valid(true), m_mem(src.data())
      {
        PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem));
      }

      void release()
      {
        if (!m_valid)
            throw error("MemoryObject.free", CL_INVALID_VALUE,
                "trying to double-unref mem object");
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseMemObject, (m_mem));
        m_valid = false;
      }

      ~memory_object()
      {
        if (m_valid)
          release();
      }

      py::object hostbuf()
      {
        if (m_hostbuf.get())
          return py::borrow<py::object>(m_hostbuf->m_buf.obj);
        else
          return py::none();
      }

      const cl_mem data() const
      { return m_mem; }

  };

#if PYOPENCL_CL_VERSION >= 0x1020
  inline
  event *enqueue_migrate_mem_objects(
      command_queue &cq,
      py::object py_mem_objects,
      cl_mem_migration_flags flags,
      py::object py_wait_for)
  {
    PYOPENCL_PARSE_WAIT_FOR;

    std::vector<cl_mem> mem_objects;
    for (py::handle mo: py_mem_objects)
      mem_objects.push_back(py::cast<const memory_object &>(mo).data());

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueMigrateMemObjects, (
            cq.data(),
            mem_objects.size(), mem_objects.empty( ) ? nullptr : &mem_objects.front(),
            flags,
            PYOPENCL_WAITLIST_ARGS, &evt
            ));
      );
    PYOPENCL_RETURN_NEW_EVENT(evt);
  }
#endif

  // }}}


  // {{{ buffer

  inline cl_mem create_buffer(
      cl_context ctx,
      cl_mem_flags flags,
      size_t size,
      void *host_ptr)
  {
    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateBuffer");
    cl_mem mem = clCreateBuffer(ctx, flags, size, host_ptr, &status_code);

    if (status_code != CL_SUCCESS)
      throw pyopencl::error("create_buffer", status_code);

    return mem;
  }




  inline cl_mem create_buffer_gc(
      cl_context ctx,
      cl_mem_flags flags,
      size_t size,
      void *host_ptr)
  {
    PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
      return create_buffer(ctx, flags, size, host_ptr);
    );
  }



#if PYOPENCL_CL_VERSION >= 0x1010
  inline cl_mem create_sub_buffer(
      cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type bct,
      const void *buffer_create_info)
  {
    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateSubBuffer");
    cl_mem mem = clCreateSubBuffer(buffer, flags,
        bct, buffer_create_info, &status_code);

    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateSubBuffer", status_code);

    return mem;
  }




  inline cl_mem create_sub_buffer_gc(
      cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type bct,
      const void *buffer_create_info)
  {
    PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(
      return create_sub_buffer(buffer, flags, bct, buffer_create_info);
    );
  }
#endif



  class buffer : public memory_object
  {
    public:
      buffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
        : memory_object(mem, retain, std::move(hostbuf))
      { }

#if PYOPENCL_CL_VERSION >= 0x1010
      buffer *get_sub_region(
          size_t origin, size_t size, cl_mem_flags flags) const
      {
        cl_buffer_region region = { origin, size};

        cl_mem mem = create_sub_buffer_gc(
            data(), flags, CL_BUFFER_CREATE_TYPE_REGION, &region);

        try
        {
          return new buffer(mem, false);
        }
        catch (...)
        {
          PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
          throw;
        }
      }

      buffer *getitem(py::object slc) const
      {
        PYOPENCL_BUFFER_SIZE_T start, end, stride, length;

        if (!PySlice_Check(slc.ptr()))
          throw pyopencl::error("Buffer.__getitem__", CL_INVALID_VALUE,
              "Buffer slice must be a slice object");

        size_t my_length;
        PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
            (data(), CL_MEM_SIZE, sizeof(my_length), &my_length, 0));

        if (PySlice_GetIndicesEx(slc.ptr(),
              my_length, &start, &end, &stride, &length) != 0)
          throw py::python_error();

        if (stride != 1)
          throw pyopencl::error("Buffer.__getitem__", CL_INVALID_VALUE,
              "Buffer slice must have stride 1");

        cl_mem_flags my_flags;
        PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
            (data(), CL_MEM_FLAGS, sizeof(my_flags), &my_flags, 0));

        my_flags &= ~CL_MEM_COPY_HOST_PTR;

        if (end <= start)
          throw pyopencl::error("Buffer.__getitem__", CL_INVALID_VALUE,
              "Buffer slice have end > start");

        return get_sub_region(start, end-start, my_flags);
      }
#endif
  };

  // {{{ buffer creation

  inline void create_buffer_py(
      buffer *self,
      context &ctx,
      cl_mem_flags flags,
      size_t size,
      py::object py_hostbuf
      )
  {
    if (py_hostbuf.ptr() != Py_None &&
        !(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
      PyErr_Warn(PyExc_UserWarning, "'hostbuf' was passed, "
          "but no memory flags to make use of it.");

    void *buf = 0;

    std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
    if (py_hostbuf.ptr() != Py_None)
    {
      retained_buf_obj = std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper);

      int py_buf_flags = PyBUF_ANY_CONTIGUOUS;
      if ((flags & CL_MEM_USE_HOST_PTR)
          && ((flags & CL_MEM_READ_WRITE)
            || (flags & CL_MEM_WRITE_ONLY)))
        py_buf_flags |= PyBUF_WRITABLE;

      retained_buf_obj->get(py_hostbuf.ptr(), py_buf_flags);

      buf = retained_buf_obj->m_buf.buf;

      if (size > size_t(retained_buf_obj->m_buf.len))
        throw pyopencl::error("Buffer", CL_INVALID_VALUE,
            "specified size is greater than host buffer size");
      if (size == 0)
        size = retained_buf_obj->m_buf.len;
    }

    cl_mem mem = create_buffer_gc(ctx.data(), flags, size, buf);

    if (!(flags & CL_MEM_USE_HOST_PTR))
      retained_buf_obj.reset();

    try
    {
      new (self) buffer(mem, false, std::move(retained_buf_obj));
    }
    catch (...)
    {
      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
      throw;
    }
  }

  // }}}

  // {{{ buffer transfers

  // {{{ byte-for-byte transfers

  inline
  event *enqueue_read_buffer(
      command_queue &cq,
      memory_object_holder &mem,
      py::object buffer,
      size_t src_offset,
      py::object py_wait_for,
      bool is_blocking)
  {
    PYOPENCL_PARSE_WAIT_FOR;

    void *buf;
    PYOPENCL_BUFFER_SIZE_T len;

    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);

    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);

    buf = ward->m_buf.buf;
    len = ward->m_buf.len;

    cl_command_queue queue = cq.data();

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueReadBuffer, (
            queue,
            mem.data(),
            PYOPENCL_CAST_BOOL(is_blocking),
            src_offset, len, buf,
            PYOPENCL_WAITLIST_ARGS, &evt
            ))
      );
    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
  }




  inline
  event *enqueue_write_buffer(
      command_queue &cq,
      memory_object_holder &mem,
      py::object buffer,
      size_t dst_offset,
      py::object py_wait_for,
      bool is_blocking)
  {
    PYOPENCL_PARSE_WAIT_FOR;

    const void *buf;
    PYOPENCL_BUFFER_SIZE_T len;

    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);

    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);

    buf = ward->m_buf.buf;
    len = ward->m_buf.len;

    cl_command_queue queue = cq.data();

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueWriteBuffer, (
            queue,
            mem.data(),
            PYOPENCL_CAST_BOOL(is_blocking),
            dst_offset, len, buf,
            PYOPENCL_WAITLIST_ARGS, &evt
            ))
      );
    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
  }




  inline
  event *enqueue_copy_buffer(
      command_queue &cq,
      memory_object_holder &src,
      memory_object_holder &dst,
      ptrdiff_t byte_count,
      size_t src_offset,
      size_t dst_offset,
      py::object py_wait_for)
  {
    PYOPENCL_PARSE_WAIT_FOR;

    if (byte_count < 0)
    {
      size_t byte_count_src = 0;
      size_t byte_count_dst = 0;
      PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
          (src.data(), CL_MEM_SIZE, sizeof(byte_count), &byte_count_src, 0));
      PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
          (src.data(), CL_MEM_SIZE, sizeof(byte_count), &byte_count_dst, 0));
      byte_count = std::min(byte_count_src, byte_count_dst);
    }

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueCopyBuffer, (
            cq.data(),
            src.data(), dst.data(),
            src_offset, dst_offset,
            byte_count,
            PYOPENCL_WAITLIST_ARGS,
            &evt
            ))
      );

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }

#ifdef CL_DEVICE_P2P_DEVICES_AMD
  inline
  event *enqueue_copy_buffer_p2p_amd(
      platform &plat,
      command_queue &cq,
      memory_object_holder &src,
      memory_object_holder &dst,
      py::object py_byte_count,
      py::object py_wait_for)
  {
    PYOPENCL_PARSE_WAIT_FOR;

    ptrdiff_t byte_count = 0;
    if (py_byte_count.ptr() == Py_None)
    {
      size_t byte_count_src = 0;
      size_t byte_count_dst = 0;
      PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
          (src.data(), CL_MEM_SIZE, sizeof(byte_count), &byte_count_src, 0));
      PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
          (dst.data(), CL_MEM_SIZE, sizeof(byte_count), &byte_count_dst, 0));
      byte_count = std::min(byte_count_src, byte_count_dst);
    }
    else
    {
      byte_count = py::cast<ptrdiff_t>(py_byte_count);
    }

    clEnqueueCopyBufferP2PAMD_fn fn = (clEnqueueCopyBufferP2PAMD_fn)clGetExtensionFunctionAddressForPlatform(plat.data(), "clEnqueueCopyBufferP2PAMD");
    if (!fn)
      throw pyopencl::error("clGetExtensionFunctionAddressForPlatform", CL_INVALID_VALUE,
          "clEnqueueCopyBufferP2PAMD is not available");

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(fn, (
        cq.data(),
        src.data(), dst.data(),
        0, 0,
        byte_count,
        PYOPENCL_WAITLIST_ARGS,
        &evt
        ))
      );

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }
#endif

  // }}}

  // {{{ rectangular transfers
#if PYOPENCL_CL_VERSION >= 0x1010
  inline
  event *enqueue_read_buffer_rect(
      command_queue &cq,
      memory_object_holder &mem,
      py::object buffer,
      py::object py_buffer_origin,
      py::object py_host_origin,
      py::object py_region,
      py::object py_buffer_pitches,
      py::object py_host_pitches,
      py::object py_wait_for,
      bool is_blocking
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;
    COPY_PY_COORD_TRIPLE(buffer_origin);
    COPY_PY_COORD_TRIPLE(host_origin);
    COPY_PY_REGION_TRIPLE(region);
    COPY_PY_PITCH_TUPLE(buffer_pitches);
    COPY_PY_PITCH_TUPLE(host_pitches);

    void *buf;

    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);

    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);

    buf = ward->m_buf.buf;

    cl_command_queue queue = cq.data();

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueReadBufferRect, (
            queue,
            mem.data(),
            PYOPENCL_CAST_BOOL(is_blocking),
            buffer_origin, host_origin, region,
            buffer_pitches[0], buffer_pitches[1],
            host_pitches[0], host_pitches[1],
            buf,
            PYOPENCL_WAITLIST_ARGS, &evt
            ))
      );
    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
  }




  inline
  event *enqueue_write_buffer_rect(
      command_queue &cq,
      memory_object_holder &mem,
      py::object buffer,
      py::object py_buffer_origin,
      py::object py_host_origin,
      py::object py_region,
      py::object py_buffer_pitches,
      py::object py_host_pitches,
      py::object py_wait_for,
      bool is_blocking
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;
    COPY_PY_COORD_TRIPLE(buffer_origin);
    COPY_PY_COORD_TRIPLE(host_origin);
    COPY_PY_REGION_TRIPLE(region);
    COPY_PY_PITCH_TUPLE(buffer_pitches);
    COPY_PY_PITCH_TUPLE(host_pitches);

    const void *buf;

    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);

    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);

    buf = ward->m_buf.buf;

    cl_command_queue queue = cq.data();

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED_THREADED(clEnqueueWriteBufferRect, (
            queue,
            mem.data(),
            PYOPENCL_CAST_BOOL(is_blocking),
            buffer_origin, host_origin, region,
            buffer_pitches[0], buffer_pitches[1],
            host_pitches[0], host_pitches[1],
            buf,
            PYOPENCL_WAITLIST_ARGS, &evt
            ))
      );
    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
  }




  inline
  event *enqueue_copy_buffer_rect(
      command_queue &cq,
      memory_object_holder &src,
      memory_object_holder &dst,
      py::object py_src_origin,
      py::object py_dst_origin,
      py::object py_region,
      py::object py_src_pitches,
      py::object py_dst_pitches,
      py::object py_wait_for)
  {
    PYOPENCL_PARSE_WAIT_FOR;
    COPY_PY_COORD_TRIPLE(src_origin);
    COPY_PY_COORD_TRIPLE(dst_origin);
    COPY_PY_REGION_TRIPLE(region);
    COPY_PY_PITCH_TUPLE(src_pitches);
    COPY_PY_PITCH_TUPLE(dst_pitches);

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueCopyBufferRect, (
            cq.data(),
            src.data(), dst.data(),
            src_origin, dst_origin, region,
            src_pitches[0], src_pitches[1],
            dst_pitches[0], dst_pitches[1],
            PYOPENCL_WAITLIST_ARGS,
            &evt
            ))
      );

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }

#endif

  // }}}

  // }}}

#if PYOPENCL_CL_VERSION >= 0x1020
  inline
  event *enqueue_fill_buffer(
      command_queue &cq,
      memory_object_holder &mem,
      py::object pattern,
      size_t offset,
      size_t size,
      py::object py_wait_for
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;

    const void *pattern_buf;
    PYOPENCL_BUFFER_SIZE_T pattern_len;

    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);

    ward->get(pattern.ptr(), PyBUF_ANY_CONTIGUOUS);

    pattern_buf = ward->m_buf.buf;
    pattern_len = ward->m_buf.len;

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueFillBuffer, (
            cq.data(),
            mem.data(),
            pattern_buf, pattern_len, offset, size,
            PYOPENCL_WAITLIST_ARGS, &evt
            ))
      );
    PYOPENCL_RETURN_NEW_EVENT(evt);
  }
#endif

  // }}}


  // {{{ image

  class image : public memory_object
  {
    public:
      image(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
        : memory_object(mem, retain, std::move(hostbuf))
      { }

      py::object get_image_info(cl_image_info param_name) const
      {
        switch (param_name)
        {
          case CL_IMAGE_FORMAT:
            PYOPENCL_GET_TYPED_INFO(Image, data(), param_name,
                cl_image_format);
          case CL_IMAGE_ELEMENT_SIZE:
          case CL_IMAGE_ROW_PITCH:
          case CL_IMAGE_SLICE_PITCH:
          case CL_IMAGE_WIDTH:
          case CL_IMAGE_HEIGHT:
          case CL_IMAGE_DEPTH:
#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_IMAGE_ARRAY_SIZE:
#endif
            PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, size_t);

#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_IMAGE_BUFFER:
            {
              cl_mem param_value;
              PYOPENCL_CALL_GUARDED(clGetImageInfo, \
                  (data(), param_name, sizeof(param_value), &param_value, 0));
              if (param_value == 0)
              {
                // no associated memory object? no problem.
                return py::none();
              }

              return create_mem_object_wrapper(param_value, /* retain */ true);
            }

          case CL_IMAGE_NUM_MIP_LEVELS:
          case CL_IMAGE_NUM_SAMPLES:
            PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, cl_uint);
#endif

          default:
            throw error("Image.get_image_info", CL_INVALID_VALUE);
        }
      }
  };




  // {{{ image formats

  inline
  void set_image_format(cl_image_format *self, cl_channel_order ord, cl_channel_type tp)
  {
    self->image_channel_order = ord;
    self->image_channel_data_type = tp;
  }

  inline
  py::list get_supported_image_formats(
      context const &ctx,
      cl_mem_flags flags,
      cl_mem_object_type image_type)
  {
    cl_uint num_image_formats;
    PYOPENCL_CALL_GUARDED(clGetSupportedImageFormats, (
          ctx.data(), flags, image_type,
          0, nullptr, &num_image_formats));

    std::vector<cl_image_format> formats(num_image_formats);
    PYOPENCL_CALL_GUARDED(clGetSupportedImageFormats, (
          ctx.data(), flags, image_type,
          formats.size(), formats.empty( ) ? nullptr : &formats.front(), nullptr));

    PYOPENCL_RETURN_VECTOR(cl_image_format, formats);
  }

  inline
  cl_uint get_image_format_channel_count(cl_image_format const &fmt)
  {
    switch (fmt.image_channel_order)
    {
      case CL_R: return 1;
      case CL_A: return 1;
      case CL_RG: return 2;
      case CL_RA: return 2;
      case CL_RGB: return 3;
      case CL_RGBA: return 4;
      case CL_BGRA: return 4;
      case CL_INTENSITY: return 1;
      case CL_LUMINANCE: return 1;
      default:
        throw pyopencl::error("ImageFormat.channel_dtype_size",
            CL_INVALID_VALUE,
            "unrecognized channel order");
    }
  }

  inline
  cl_uint get_image_format_channel_dtype_size(cl_image_format const &fmt)
  {
    switch (fmt.image_channel_data_type)
    {
      case CL_SNORM_INT8: return 1;
      case CL_SNORM_INT16: return 2;
      case CL_UNORM_INT8: return 1;
      case CL_UNORM_INT16: return 2;
      case CL_UNORM_SHORT_565: return 2;
      case CL_UNORM_SHORT_555: return 2;
      case CL_UNORM_INT_101010: return 4;
      case CL_SIGNED_INT8: return 1;
      case CL_SIGNED_INT16: return 2;
      case CL_SIGNED_INT32: return 4;
      case CL_UNSIGNED_INT8: return 1;
      case CL_UNSIGNED_INT16: return 2;
      case CL_UNSIGNED_INT32: return 4;
      case CL_HALF_FLOAT: return 2;
      case CL_FLOAT: return 4;
      default:
        throw pyopencl::error("ImageFormat.channel_dtype_size",
            CL_INVALID_VALUE,
            "unrecognized channel data type");
    }
  }

  inline
  cl_uint get_image_format_item_size(cl_image_format const &fmt)
  {
    return get_image_format_channel_count(fmt)
      * get_image_format_channel_dtype_size(fmt);
  }

  // }}}

  // {{{ image creation

  inline
  void create_image(
      image *self,
      context const &ctx,
      cl_mem_flags flags,
      cl_image_format const &fmt,
      py::sequence shape,
      py::sequence pitches,
      py::object buffer)
  {
    if (shape.ptr() == Py_None)
      throw pyopencl::error("Image", CL_INVALID_VALUE,
          "'shape' must be given");

    void *buf = 0;
    PYOPENCL_BUFFER_SIZE_T len = 0;

    std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
    if (buffer.ptr() != Py_None)
    {
      retained_buf_obj = std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper);

      int py_buf_flags = PyBUF_ANY_CONTIGUOUS;
      if ((flags & CL_MEM_USE_HOST_PTR)
          && ((flags & CL_MEM_READ_WRITE)
            || (flags & CL_MEM_WRITE_ONLY)))
        py_buf_flags |= PyBUF_WRITABLE;

      retained_buf_obj->get(buffer.ptr(), py_buf_flags);

      buf = retained_buf_obj->m_buf.buf;
      len = retained_buf_obj->m_buf.len;
    }

    unsigned dims = py::len(shape);
    cl_int status_code;
    cl_mem mem;
    if (dims == 2)
    {
      size_t width = py::cast<size_t>(shape[0]);
      size_t height = py::cast<size_t>(shape[1]);

      size_t pitch = 0;
      if (pitches.ptr() != Py_None)
      {
        if (py::len(pitches) != 1)
          throw pyopencl::error("Image", CL_INVALID_VALUE,
              "invalid length of pitch tuple");
        pitch = py::cast<size_t>(pitches[0]);
      }

      // check buffer size
      cl_int itemsize = get_image_format_item_size(fmt);
      if (buf && std::max(pitch, width*itemsize)*height > cl_uint(len))
          throw pyopencl::error("Image", CL_INVALID_VALUE,
              "buffer too small");

      PYOPENCL_PRINT_CALL_TRACE("clCreateImage2D");
      PYOPENCL_RETRY_IF_MEM_ERROR(
          {
            mem = clCreateImage2D(ctx.data(), flags, &fmt,
                width, height, pitch, buf, &status_code);
            if (status_code != CL_SUCCESS)
              throw pyopencl::error("clCreateImage2D", status_code);
          } );

    }
    else if (dims == 3)
    {
      size_t width = py::cast<size_t>(shape[0]);
      size_t height = py::cast<size_t>(shape[1]);
      size_t depth = py::cast<size_t>(shape[2]);

      size_t pitch_x = 0;
      size_t pitch_y = 0;

      if (pitches.ptr() != Py_None)
      {
        if (py::len(pitches) != 2)
          throw pyopencl::error("Image", CL_INVALID_VALUE,
              "invalid length of pitch tuple");

        pitch_x = py::cast<size_t>(pitches[0]);
        pitch_y = py::cast<size_t>(pitches[1]);
      }

      // check buffer size
      cl_int itemsize = get_image_format_item_size(fmt);
      if (buf &&
          std::max(std::max(pitch_x, width*itemsize)*height, pitch_y)
          * depth > cl_uint(len))
        throw pyopencl::error("Image", CL_INVALID_VALUE,
            "buffer too small");

      PYOPENCL_PRINT_CALL_TRACE("clCreateImage3D");
      PYOPENCL_RETRY_IF_MEM_ERROR(
          {
            mem = clCreateImage3D(ctx.data(), flags, &fmt,
              width, height, depth, pitch_x, pitch_y, buf, &status_code);
            if (status_code != CL_SUCCESS)
              throw pyopencl::error("clCreateImage3D", status_code);
          } );
    }
    else
      throw pyopencl::error("Image", CL_INVALID_VALUE,
          "invalid dimension");

    if (!(flags & CL_MEM_USE_HOST_PTR))
      retained_buf_obj.reset();

    try
    {
      new (self) image(mem, false, std::move(retained_buf_obj));
    }
    catch (...)
    {
      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
      throw;
    }
  }

#if PYOPENCL_CL_VERSION >= 0x1020

  inline
  void create_image_from_desc(
      image *self,
      context const &ctx,
      cl_mem_flags flags,
      cl_image_format const &fmt,
      cl_image_desc &desc,
      py::object buffer)
  {
    if (buffer.ptr() != Py_None &&
        !(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
      PyErr_Warn(PyExc_UserWarning, "'hostbuf' was passed, "
          "but no memory flags to make use of it.");

    void *buf = 0;

    std::unique_ptr<py_buffer_wrapper> retained_buf_obj;
    if (buffer.ptr() != Py_None)
    {
      retained_buf_obj = std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper);

      int py_buf_flags = PyBUF_ANY_CONTIGUOUS;
      if ((flags & CL_MEM_USE_HOST_PTR)
          && ((flags & CL_MEM_READ_WRITE)
            || (flags & CL_MEM_WRITE_ONLY)))
        py_buf_flags |= PyBUF_WRITABLE;

      retained_buf_obj->get(buffer.ptr(), py_buf_flags);

      buf = retained_buf_obj->m_buf.buf;
    }

    PYOPENCL_PRINT_CALL_TRACE("clCreateImage");
    cl_int status_code;
    cl_mem mem = clCreateImage(ctx.data(), flags, &fmt, &desc, buf, &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateImage", status_code);

    if (!(flags & CL_MEM_USE_HOST_PTR))
      retained_buf_obj.reset();

    try
    {
      new (self) image(mem, false, std::move(retained_buf_obj));
    }
    catch (...)
    {
      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
      throw;
    }
  }

#endif

  // }}}

  // {{{ image transfers

  inline
  event *enqueue_read_image(
      command_queue &cq,
      image &img,
      py::object py_origin, py::object py_region,
      py::object buffer,
      size_t row_pitch, size_t slice_pitch,
      py::object py_wait_for,
      bool is_blocking)
  {
    PYOPENCL_PARSE_WAIT_FOR;
    COPY_PY_COORD_TRIPLE(origin);
    COPY_PY_REGION_TRIPLE(region);

    void *buf;

    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);

    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);

    buf = ward->m_buf.buf;

    cl_event evt;

    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueReadImage, (
            cq.data(),
            img.data(),
            PYOPENCL_CAST_BOOL(is_blocking),
            origin, region, row_pitch, slice_pitch, buf,
            PYOPENCL_WAITLIST_ARGS, &evt
            ));
      );
    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
  }




  inline
  event *enqueue_write_image(
      command_queue &cq,
      image &img,
      py::object py_origin, py::object py_region,
      py::object buffer,
      size_t row_pitch, size_t slice_pitch,
      py::object py_wait_for,
      bool is_blocking)
  {
    PYOPENCL_PARSE_WAIT_FOR;
    COPY_PY_COORD_TRIPLE(origin);
    COPY_PY_REGION_TRIPLE(region);

    const void *buf;

    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);

    ward->get(buffer.ptr(), PyBUF_ANY_CONTIGUOUS);

    buf = ward->m_buf.buf;

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueWriteImage, (
            cq.data(),
            img.data(),
            PYOPENCL_CAST_BOOL(is_blocking),
            origin, region, row_pitch, slice_pitch, buf,
            PYOPENCL_WAITLIST_ARGS, &evt
            ));
      );
    PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, ward);
  }




  inline
  event *enqueue_copy_image(
      command_queue &cq,
      memory_object_holder &src,
      memory_object_holder &dest,
      py::object py_src_origin,
      py::object py_dest_origin,
      py::object py_region,
      py::object py_wait_for
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;
    COPY_PY_COORD_TRIPLE(src_origin);
    COPY_PY_COORD_TRIPLE(dest_origin);
    COPY_PY_REGION_TRIPLE(region);

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueCopyImage, (
            cq.data(), src.data(), dest.data(),
            src_origin, dest_origin, region,
            PYOPENCL_WAITLIST_ARGS, &evt
            ));
      );
    PYOPENCL_RETURN_NEW_EVENT(evt);
  }




  inline
  event *enqueue_copy_image_to_buffer(
      command_queue &cq,
      memory_object_holder &src,
      memory_object_holder &dest,
      py::object py_origin,
      py::object py_region,
      size_t offset,
      py::object py_wait_for
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;
    COPY_PY_COORD_TRIPLE(origin);
    COPY_PY_REGION_TRIPLE(region);

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueCopyImageToBuffer, (
            cq.data(), src.data(), dest.data(),
            origin, region, offset,
            PYOPENCL_WAITLIST_ARGS, &evt
            ));
      );
    PYOPENCL_RETURN_NEW_EVENT(evt);
  }




  inline
  event *enqueue_copy_buffer_to_image(
      command_queue &cq,
      memory_object_holder &src,
      memory_object_holder &dest,
      size_t offset,
      py::object py_origin,
      py::object py_region,
      py::object py_wait_for
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;
    COPY_PY_COORD_TRIPLE(origin);
    COPY_PY_REGION_TRIPLE(region);

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueCopyBufferToImage, (
            cq.data(), src.data(), dest.data(),
            offset, origin, region,
            PYOPENCL_WAITLIST_ARGS, &evt
            ));
      );
    PYOPENCL_RETURN_NEW_EVENT(evt);
  }

  // }}}

#if PYOPENCL_CL_VERSION >= 0x1020
  inline
  event *enqueue_fill_image(
      command_queue &cq,
      memory_object_holder &mem,
      py::object color,
      py::object py_origin, py::object py_region,
      py::object py_wait_for
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;

    COPY_PY_COORD_TRIPLE(origin);
    COPY_PY_REGION_TRIPLE(region);

    const void *color_buf;

    std::unique_ptr<py_buffer_wrapper> ward(new py_buffer_wrapper);

    ward->get(color.ptr(), PyBUF_ANY_CONTIGUOUS);

    color_buf = ward->m_buf.buf;

    cl_event evt;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      PYOPENCL_CALL_GUARDED(clEnqueueFillImage, (
            cq.data(),
            mem.data(),
            color_buf, origin, region,
            PYOPENCL_WAITLIST_ARGS, &evt
            ));
      );
    PYOPENCL_RETURN_NEW_EVENT(evt);
  }
#endif

  // }}}


  // {{{ pipe

  class pipe : public memory_object
  {
    public:
      pipe(cl_mem mem, bool retain)
        : memory_object(mem, retain)
      { }

#if PYOPENCL_CL_VERSION < 0x2000
      typedef void* cl_pipe_info;
#endif

      py::object get_pipe_info(cl_pipe_info param_name) const
      {
#if PYOPENCL_CL_VERSION >= 0x2000
        switch (param_name)
        {
          case CL_PIPE_PACKET_SIZE:
          case CL_PIPE_MAX_PACKETS:
            PYOPENCL_GET_TYPED_INFO(Pipe, data(), param_name, cl_uint);

          default:
            throw error("Pipe.get_pipe_info", CL_INVALID_VALUE);
        }
#else
        throw error("Pipes not available. PyOpenCL was not compiled against a CL2+ header.",
            CL_INVALID_VALUE);
#endif
      }
  };

#if PYOPENCL_CL_VERSION >= 0x2000
  inline
  void create_pipe(
      pipe *self,
      context const &ctx,
      cl_mem_flags flags,
      cl_uint pipe_packet_size,
      cl_uint pipe_max_packets,
      py::sequence py_props)
  {
#if 0
    PYOPENCL_STACK_CONTAINER(cl_pipe_properties, props, py::len(py_props) + 1);
    {
      size_t i = 0;
      for (auto prop: py_props)
        props[i++] = py::cast<cl_pipe_properties>(prop);
      props[i++] = 0;
    }
#endif
    if (py::len(py_props) != 0)
      throw pyopencl::error("Pipe", CL_INVALID_VALUE, "non-empty properties "
          "argument to Pipe not allowed");

    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreatePipe");

    cl_mem mem = clCreatePipe(
        ctx.data(),
        flags,
        pipe_packet_size,
        pipe_max_packets,
        nullptr,
        &status_code);

    if (status_code != CL_SUCCESS)
      throw pyopencl::error("Pipe", status_code);

    try
    {
      new (self) pipe(mem, false);
    }
    catch (...)
    {
      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
      throw;
    }
}
#endif

  // }}}


  // {{{ maps
  class memory_map
  {
    private:
      bool m_valid;
      std::shared_ptr<command_queue> m_queue;
      memory_object m_mem;
      void *m_ptr;

    public:
      memory_map(std::shared_ptr<command_queue> cq, memory_object const &mem, void *ptr)
        : m_valid(true), m_queue(cq), m_mem(mem), m_ptr(ptr)
      {
      }

      ~memory_map()
      {
        if (m_valid)
          delete release(0, py::none());
      }

      event *release(command_queue *cq, py::object py_wait_for)
      {
        PYOPENCL_PARSE_WAIT_FOR;

        if (cq == 0)
          cq = m_queue.get();

        cl_event evt;
        PYOPENCL_CALL_GUARDED(clEnqueueUnmapMemObject, (
              cq->data(), m_mem.data(), m_ptr,
              PYOPENCL_WAITLIST_ARGS, &evt
              ));

        m_valid = false;

        PYOPENCL_RETURN_NEW_EVENT(evt);
      }
  };




  // FIXME: Reenable in pypy
#ifndef PYPY_VERSION
  inline
  py::object enqueue_map_buffer(
      std::shared_ptr<command_queue> cq,
      memory_object_holder &buf,
      cl_map_flags flags,
      size_t offset,
      py::object py_shape, py::object dtype,
      py::object py_order, py::object py_strides,
      py::object py_wait_for,
      bool is_blocking
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;
    PYOPENCL_PARSE_NUMPY_ARRAY_SPEC;

    npy_uintp size_in_bytes = PyDataType_ELSIZE(tp_descr);
    for (npy_intp sdim: shape)
      size_in_bytes *= sdim;

    py::object result;
    PyArrayObject *result_arr;

    cl_event evt;
    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clEnqueueMapBuffer");
    void *mapped;

    PYOPENCL_RETRY_IF_MEM_ERROR(
        {
          {
            py::gil_scoped_release release;
            mapped = clEnqueueMapBuffer(
                  cq->data(), buf.data(),
                  PYOPENCL_CAST_BOOL(is_blocking), flags,
                  offset, size_in_bytes,
                  PYOPENCL_WAITLIST_ARGS, &evt,
                  &status_code);
          }
          if (status_code != CL_SUCCESS)
            throw pyopencl::error("clEnqueueMapBuffer", status_code);
        } );

    event evt_handle(evt, false);

    std::unique_ptr<memory_map> map;

    try
    {
      result = py::object(py::steal<py::object>(PyArray_NewFromDescr(
          &PyArray_Type, tp_descr,
          shape.size(),
          shape.empty() ? nullptr : &shape.front(),
          strides.empty() ? nullptr : &strides.front(),
          mapped, ary_flags, /*obj*/nullptr)));

      result_arr = (PyArrayObject *) result.ptr();
      if (size_in_bytes != (npy_uintp) PyArray_NBYTES(result_arr))
        throw pyopencl::error("enqueue_map_buffer", CL_INVALID_VALUE,
            "miscalculated numpy array size (not contiguous?)");

       map = std::unique_ptr<memory_map>(new memory_map(cq, buf, mapped));
    }
    catch (...)
    {
      PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueUnmapMemObject, (
            cq->data(), buf.data(), mapped, 0, 0, 0));
      throw;
    }

    py::object map_py(handle_from_new_ptr(map.release()));
    PyArray_SetBaseObject(result_arr, map_py.ptr());
    Py_INCREF(map_py.ptr());

    return py::make_tuple(
        result,
        handle_from_new_ptr(new event(evt_handle)));
  }
#endif




  // FIXME: Reenable in pypy
#ifndef PYPY_VERSION
  inline
  py::object enqueue_map_image(
      std::shared_ptr<command_queue> cq,
      memory_object_holder &img,
      cl_map_flags flags,
      py::object py_origin,
      py::object py_region,
      py::object py_shape, py::object dtype,
      py::object py_order, py::object py_strides,
      py::object py_wait_for,
      bool is_blocking
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;
    PYOPENCL_PARSE_NUMPY_ARRAY_SPEC;
    COPY_PY_COORD_TRIPLE(origin);
    COPY_PY_REGION_TRIPLE(region);

    cl_event evt;
    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clEnqueueMapImage");
    size_t row_pitch, slice_pitch;
    void *mapped;
    PYOPENCL_RETRY_IF_MEM_ERROR(
      {
        {
          py::gil_scoped_release release;
          mapped = clEnqueueMapImage(
                cq->data(), img.data(),
                PYOPENCL_CAST_BOOL(is_blocking), flags,
                origin, region, &row_pitch, &slice_pitch,
                PYOPENCL_WAITLIST_ARGS, &evt,
                &status_code);
        }
        if (status_code != CL_SUCCESS)
          throw pyopencl::error("clEnqueueMapImage", status_code);
      } );

    event evt_handle(evt, false);

    std::unique_ptr<memory_map> map;
    try
    {
       map = std::unique_ptr<memory_map>(new memory_map(cq, img, mapped));
    }
    catch (...)
    {
      PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueUnmapMemObject, (
            cq->data(), img.data(), mapped, 0, 0, 0));
      throw;
    }

    py::object result = py::steal<py::object>(PyArray_NewFromDescr(
        &PyArray_Type, tp_descr,
        shape.size(),
        shape.empty() ? nullptr : &shape.front(),
        strides.empty() ? nullptr : &strides.front(),
        mapped, ary_flags, /*obj*/nullptr));
    PyArrayObject *result_arr = (PyArrayObject *) result.ptr();

    py::object map_py(handle_from_new_ptr(map.release()));
    PyArray_SetBaseObject(result_arr, map_py.ptr());
    Py_INCREF(map_py.ptr());

    return py::make_tuple(
        result,
        handle_from_new_ptr(new event(evt_handle)),
        row_pitch, slice_pitch);
  }
#endif

  // }}}


#if PYOPENCL_CL_VERSION >= 0x2000

  // {{{ svm pointer

  class size_not_available { };

  class svm_pointer
  {
    public:
      virtual void *svm_ptr() const = 0;
      // may throw size_not_available
      virtual size_t size() const = 0;
      virtual ~svm_pointer()
      { }
  };

  // }}}


  // {{{ svm_arg_wrapper

  class svm_arg_wrapper : public svm_pointer
  {
    private:
      void *m_ptr;
      PYOPENCL_BUFFER_SIZE_T m_size;
      std::unique_ptr<py_buffer_wrapper> ward;

    public:
      svm_arg_wrapper(py::object holder)
      {
        ward = std::unique_ptr<py_buffer_wrapper>(new py_buffer_wrapper);
#ifdef PYPY_VERSION
        // FIXME: get a read-only buffer
        // Not quite honest, but Pypy doesn't consider numpy arrays
        // created from objects with the __aray_interface__ writeable.
        ward->get(holder.ptr(), PyBUF_ANY_CONTIGUOUS);
#else
        ward->get(holder.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
#endif
        m_ptr = ward->m_buf.buf;
        m_size = ward->m_buf.len;
      }

      void *svm_ptr() const
      {
        return m_ptr;
      }
      size_t size() const
      {
        return m_size;
      }
  };

  // }}}


  // {{{ svm_allocation

  class svm_allocation : public svm_pointer
  {
    private:
      std::shared_ptr<context> m_context;
      void *m_allocation;
      size_t m_size;
      command_queue_ref m_queue;
      // FIXME Should maybe also allow keeping a list of events so that we can
      // wait for users to finish in the case of out-of-order queues.

    public:
      svm_allocation(std::shared_ptr<context> const &ctx, size_t size, cl_uint alignment,
          cl_svm_mem_flags flags, const command_queue *queue = nullptr)
        : m_context(ctx), m_size(size)
      {
        if (queue)
        {
          m_queue.set(queue->data());
          if (is_queue_out_of_order(m_queue.data()))
            throw error("SVMAllocation.__init__", CL_INVALID_VALUE,
                "supplying an out-of-order queue to SVMAllocation is invalid");
        }

        if (size)
        {
          int try_count = 0;
          while (try_count < 2)
          {
            PYOPENCL_PRINT_CALL_TRACE("clSVMalloc");
            m_allocation = clSVMAlloc(
                ctx->data(),
                flags, size, alignment);
            if (m_allocation)
              return;

            ++try_count;
            run_python_gc();
          }

          if (!m_allocation)
            throw pyopencl::error("clSVMAlloc", CL_OUT_OF_RESOURCES);
        }
      }

      svm_allocation(std::shared_ptr<context> const &ctx, void *allocation, size_t size,
           const cl_command_queue queue)
        : m_context(ctx), m_allocation(allocation), m_size(size)
      {
        if (queue)
        {
          if (is_queue_out_of_order(queue))
          {
            release();
            throw error("SVMAllocation.__init__", CL_INVALID_VALUE,
                "supplying an out-of-order queue to SVMAllocation is invalid");
          }
          m_queue.set(queue);
        }
      }

      svm_allocation(const svm_allocation &) = delete;
      svm_allocation &operator=(const svm_allocation &) = delete;

      ~svm_allocation()
      {
        if (m_allocation)
          release();
      }

      void release()
      {
        if (m_size == 0)
          return;

        if (!m_allocation)
          throw error("SVMAllocation.release", CL_INVALID_VALUE,
              "trying to double-unref svm allocation");

        if (m_queue.is_valid())
        {
          PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, (
                m_queue.data(), 1, &m_allocation,
                nullptr, nullptr,
                0, nullptr, nullptr));
          m_queue.reset();
        }
        else
        {
          PYOPENCL_PRINT_CALL_TRACE("clSVMFree");
          clSVMFree(m_context->data(), m_allocation);
        }
        m_allocation = nullptr;
      }

      event *enqueue_release(command_queue *queue, py::object py_wait_for)
      {
        PYOPENCL_PARSE_WAIT_FOR;

        if (m_size && !m_allocation)
          throw error("SVMAllocation.enqueue_release", CL_INVALID_VALUE,
              "trying to enqueue_release on an already-freed allocation");

        cl_command_queue use_queue;
        if (queue)
          use_queue = queue->data();
        else
        {
          if (m_queue.is_valid())
            use_queue = m_queue.data();
          else
            throw error("SVMAllocation.enqueue_release", CL_INVALID_VALUE,
                "no implicit queue available, must be provided explicitly");
        }

        cl_event evt;

        if (m_size == 0)
        {
          // We need to get an event from somewhere...
          // We're using SVM, we must have 2.0 > 1.2.
          PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueMarkerWithWaitList,
                      (use_queue, PYOPENCL_WAITLIST_ARGS, &evt));
        }
        else
        {
          PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, (
                use_queue, 1, &m_allocation,
                nullptr, nullptr,
                PYOPENCL_WAITLIST_ARGS, &evt));
        }

        m_allocation = nullptr;

        PYOPENCL_RETURN_NEW_EVENT(evt);
      }

      void *svm_ptr() const
      {
        return m_allocation;
      }

      size_t size() const
      {
        return m_size;
      }

      bool operator==(svm_allocation const &other) const
      {
        return m_allocation == other.m_allocation;
      }

      bool operator!=(svm_allocation const &other) const
      {
        return m_allocation != other.m_allocation;
      }

      void bind_to_queue(command_queue const &queue)
      {
        if (is_queue_out_of_order(queue.data()))
          throw error("SVMAllocation.bind_to_queue", CL_INVALID_VALUE,
              "supplying an out-of-order queue to SVMAllocation is invalid");

        if (m_queue.is_valid())
        {
          if (m_queue.data() != queue.data())
          {
            // make sure synchronization promises stay valid in new queue
            cl_event evt;

            PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_queue.data(), &evt));
            PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList,
                (queue.data(), 1, &evt, nullptr));
          }
        }

        m_queue.set(queue.data());
      }

      void unbind_from_queue()
      {
        if (m_queue.is_valid())
          PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_queue.data()));

        m_queue.reset();
      }

      // only use for testing/diagnostic/debugging purposes!
      cl_command_queue queue() const
      {
        if (m_queue.is_valid())
          return m_queue.data();
        else
          return nullptr;
      }
  };

  // }}}


  // {{{ svm operations

  inline
  event *enqueue_svm_memcpy(
      command_queue &cq,
      cl_bool is_blocking,
      svm_pointer &dst, svm_pointer &src,
      py::object py_wait_for,
      py::object byte_count_py
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;

    // {{{ process size

    PYOPENCL_GET_SVM_SIZE(src);
    PYOPENCL_GET_SVM_SIZE(dst);

    size_t size = 0;
    bool have_size = false;

    if (src_has_size)
    {
      size = src_size;
      have_size = true;
    }
    if (dst_has_size)
    {
      if (have_size)
      {
        if (!byte_count_py.is_none())
          size = std::min(size, dst_size);
        else if (size != dst_size)
          throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
              "sizes of source and destination buffer do not match");
      }
      else
      {
        size = dst_size;
        have_size = true;
      }
    }

    if (!byte_count_py.is_none())
    {
      size_t byte_count = py::cast<size_t>(byte_count_py);
      if (have_size && byte_count > size)
        throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
            "specified byte_count larger than size of source or destination buffers");
      size = byte_count;
      have_size = true;
    }

    if (!have_size)
      throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
          "size not passed and could not be determined");

    // }}}

    cl_event evt;
    PYOPENCL_CALL_GUARDED(
        clEnqueueSVMMemcpy,
        (
          cq.data(),
          is_blocking,
          dst.svm_ptr(), src.svm_ptr(),
          size,
          PYOPENCL_WAITLIST_ARGS,
          &evt
        ));

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }


  inline
  event *enqueue_svm_memfill(
      command_queue &cq,
      svm_pointer &dst, py::object py_pattern,
      py::object byte_count,
      py::object py_wait_for
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;

    const void *pattern_ptr;
    PYOPENCL_BUFFER_SIZE_T pattern_len;

    std::unique_ptr<py_buffer_wrapper> pattern_ward(new py_buffer_wrapper);

    pattern_ward->get(py_pattern.ptr(), PyBUF_ANY_CONTIGUOUS);

    pattern_ptr = pattern_ward->m_buf.buf;
    pattern_len = pattern_ward->m_buf.len;

    // {{{ process size

    PYOPENCL_GET_SVM_SIZE(dst);

    size_t size = 0;
    bool have_size = false;
    if (dst_has_size)
    {
      size = dst_size;
      have_size = true;
    }
    if (!byte_count.is_none())
    {
      size_t user_size = py::cast<size_t>(byte_count);
      if (have_size && user_size > size)
        throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
            "byte_count too large for specified SVM buffer");
    }

    if (!have_size)
    {
      throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
          "byte_count not passed and could not be determined");
    }

    // }}}

    cl_event evt;
    PYOPENCL_CALL_GUARDED(
        clEnqueueSVMMemFill,
        (
          cq.data(),
          dst.svm_ptr(), pattern_ptr,
          pattern_len,
          size,
          PYOPENCL_WAITLIST_ARGS,
          &evt
        ));

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }


  inline
  event *enqueue_svm_map(
      command_queue &cq,
      cl_bool is_blocking,
      cl_map_flags flags,
      svm_pointer &svm,
      py::object py_wait_for,
      py::object user_size_py
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;

    // {{{ process size

    PYOPENCL_GET_SVM_SIZE(svm);

    size_t size = 0;
    bool have_size = false;
    if (svm_has_size)
    {
      size = svm_size;
      have_size = true;
    }
    if (!user_size_py.is_none())
    {
      size_t user_size = py::cast<size_t>(user_size_py);
      if (have_size && user_size > size)
        throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
            "user-provided size too large for specified SVM buffer");
    }

    if (!have_size)
    {
      throw error("enqueue_svm_mem_map", CL_INVALID_VALUE,
          "size not passed and could not be determined");
    }

    // }}}

    cl_event evt;
    PYOPENCL_CALL_GUARDED(
        clEnqueueSVMMap,
        (
          cq.data(),
          is_blocking,
          flags,
          svm.svm_ptr(), size,
          PYOPENCL_WAITLIST_ARGS,
          &evt
        ));

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }


  inline
  event *enqueue_svm_unmap(
      command_queue &cq,
      svm_pointer &svm,
      py::object py_wait_for
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;

    cl_event evt;
    PYOPENCL_CALL_GUARDED(
        clEnqueueSVMUnmap,
        (
          cq.data(),
          svm.svm_ptr(),
          PYOPENCL_WAITLIST_ARGS,
          &evt
        ));

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }
#endif


#if PYOPENCL_CL_VERSION >= 0x2010
  inline
  event *enqueue_svm_migratemem(
      command_queue &cq,
      py::sequence svms,
      cl_mem_migration_flags flags,
      py::object py_wait_for
      )
  {
    PYOPENCL_PARSE_WAIT_FOR;

    std::vector<const void *> svm_pointers;
    std::vector<size_t> sizes;

    for (py::handle py_svm: svms)
    {
      svm_pointer &svm(py::cast<svm_pointer &>(py_svm));

      svm_pointers.push_back(svm.svm_ptr());
      sizes.push_back(svm.size());
    }

    cl_event evt;
    PYOPENCL_CALL_GUARDED(
        clEnqueueSVMMigrateMem,
        (
         cq.data(),
         svm_pointers.size(),
         svm_pointers.empty() ? nullptr : &svm_pointers.front(),
         sizes.empty() ? nullptr : &sizes.front(),
         flags,
         PYOPENCL_WAITLIST_ARGS,
         &evt
        ));

    PYOPENCL_RETURN_NEW_EVENT(evt);
  }
#endif

  // }}}


  // {{{ sampler

  class sampler : noncopyable
  {
    private:
      cl_sampler m_sampler;

    public:
#if PYOPENCL_CL_VERSION >= 0x2000
      sampler(context const &ctx, py::sequence py_props)
      {
        int hex_plat_version = ctx.get_hex_platform_version();

        if (hex_plat_version  < 0x2000)
        {
          std::cerr <<
            "sampler properties given as an iterable, "
            "which uses an OpenCL 2+-only interface, "
            "but the context's platform does not "
            "declare OpenCL 2 support. Proceeding "
            "as requested, but the next thing you see "
            "may be a crash." << std:: endl;
        }

        PYOPENCL_STACK_CONTAINER(cl_sampler_properties, props, py::len(py_props) + 1);
        {
          size_t i = 0;
          for (auto prop: py_props)
            props[i++] = py::cast<cl_sampler_properties>(prop);
          props[i++] = 0;
        }

        cl_int status_code;
        PYOPENCL_PRINT_CALL_TRACE("clCreateSamplerWithProperties");

        m_sampler = clCreateSamplerWithProperties(
            ctx.data(),
            PYOPENCL_STACK_CONTAINER_GET_PTR(props),
            &status_code);

        if (status_code != CL_SUCCESS)
          throw pyopencl::error("Sampler", status_code);
      }
#endif

      sampler(context const &ctx, bool normalized_coordinates,
          cl_addressing_mode am, cl_filter_mode fm)
      {
        PYOPENCL_PRINT_CALL_TRACE("clCreateSampler");

        int hex_plat_version = ctx.get_hex_platform_version();
#if PYOPENCL_CL_VERSION >= 0x2000
        if (hex_plat_version  >= 0x2000)
        {
            cl_sampler_properties props_list[] = {
              CL_SAMPLER_NORMALIZED_COORDS, normalized_coordinates,
              CL_SAMPLER_ADDRESSING_MODE, am,
              CL_SAMPLER_FILTER_MODE, fm,
              0,
            };

            cl_int status_code;

            PYOPENCL_PRINT_CALL_TRACE("clCreateSamplerWithProperties");
            m_sampler = clCreateSamplerWithProperties(
                ctx.data(), props_list, &status_code);

            if (status_code != CL_SUCCESS)
              throw pyopencl::error("Sampler", status_code);
        }
        else
#endif
        {
          cl_int status_code;

#if defined(__GNUG__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
          m_sampler = clCreateSampler(
              ctx.data(),
              normalized_coordinates,
              am, fm, &status_code);
#if defined(__GNUG__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif

          if (status_code != CL_SUCCESS)
            throw pyopencl::error("Sampler", status_code);
        }
      }

      sampler(cl_sampler samp, bool retain)
        : m_sampler(samp)
      {
        if (retain)
          PYOPENCL_CALL_GUARDED(clRetainSampler, (samp));
      }

      ~sampler()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseSampler, (m_sampler));
      }

      cl_sampler data() const
      {
        return m_sampler;
      }

      PYOPENCL_EQUALITY_TESTS(sampler);

      py::object get_info(cl_sampler_info param_name) const
      {
        switch (param_name)
        {
          case CL_SAMPLER_REFERENCE_COUNT:
            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                cl_uint);
          case CL_SAMPLER_CONTEXT:
            PYOPENCL_GET_OPAQUE_INFO(Sampler, m_sampler, param_name,
                cl_context, context);
          case CL_SAMPLER_ADDRESSING_MODE:
            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                cl_addressing_mode);
          case CL_SAMPLER_FILTER_MODE:
            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                cl_filter_mode);
          case CL_SAMPLER_NORMALIZED_COORDS:
            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                cl_bool);
#if PYOPENCL_CL_VERSION >= 0x3000
          case CL_SAMPLER_PROPERTIES:
            {
              std::vector<cl_sampler_properties> result;
              PYOPENCL_GET_VEC_INFO(Sampler, m_sampler, param_name, result);
              PYOPENCL_RETURN_VECTOR(cl_sampler_properties, result);
            }
#endif

#ifdef CL_SAMPLER_MIP_FILTER_MODE_KHR
          case CL_SAMPLER_MIP_FILTER_MODE_KHR:
            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                cl_filter_mode);
          case CL_SAMPLER_LOD_MIN_KHR:
          case CL_SAMPLER_LOD_MAX_KHR:
            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, float);
#endif

          default:
            throw error("Sampler.get_info", CL_INVALID_VALUE);
        }
      }
  };

  // }}}


  // {{{ program

  class program : noncopyable
  {
    public:
      enum program_kind_type { KND_UNKNOWN, KND_SOURCE, KND_BINARY, KND_IL };

    private:
      cl_program m_program;
      program_kind_type m_program_kind;

    public:
      program(cl_program prog, bool retain, program_kind_type progkind=KND_UNKNOWN)
        : m_program(prog), m_program_kind(progkind)
      {
        if (retain)
          PYOPENCL_CALL_GUARDED(clRetainProgram, (prog));
      }

      ~program()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseProgram, (m_program));
      }

      cl_program data() const
      {
        return m_program;
      }

      program_kind_type kind() const
      {
        return m_program_kind;
      }

      PYOPENCL_EQUALITY_TESTS(program);

      py::object get_info(cl_program_info param_name) const
      {
        switch (param_name)
        {
          case CL_PROGRAM_REFERENCE_COUNT:
            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name,
                cl_uint);
          case CL_PROGRAM_CONTEXT:
            PYOPENCL_GET_OPAQUE_INFO(Program, m_program, param_name,
                cl_context, context);
          case CL_PROGRAM_NUM_DEVICES:
            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_uint);
          case CL_PROGRAM_DEVICES:
            {
              std::vector<cl_device_id> result;
              PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result);

              py::list py_result;
              for (cl_device_id did: result)
                py_result.append(handle_from_new_ptr(
                      new pyopencl::device(did)));
              return py_result;
            }
          case CL_PROGRAM_SOURCE:
            PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
          case CL_PROGRAM_BINARY_SIZES:
            {
              std::vector<size_t> result;
              PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result);
              PYOPENCL_RETURN_VECTOR(size_t, result);
            }
          case CL_PROGRAM_BINARIES:
            // {{{
            {
              std::vector<size_t> sizes;
              PYOPENCL_GET_VEC_INFO(Program, m_program, CL_PROGRAM_BINARY_SIZES, sizes);

              size_t total_size = std::accumulate(sizes.begin(), sizes.end(), 0);

              std::unique_ptr<unsigned char []> result(
                  new unsigned char[total_size]);
              std::vector<unsigned char *> result_ptrs;

              unsigned char *ptr = result.get();
              for (unsigned i = 0; i < sizes.size(); ++i)
              {
                result_ptrs.push_back(ptr);
                ptr += sizes[i];
              }

              PYOPENCL_CALL_GUARDED(clGetProgramInfo,
                  (m_program, param_name, sizes.size()*sizeof(unsigned char *),
                   result_ptrs.empty( ) ? nullptr : &result_ptrs.front(), 0)); \

              py::list py_result;
              ptr = result.get();
              for (unsigned i = 0; i < sizes.size(); ++i)
              {
                py::object binary_pyobj(
                    py::steal<py::object>(
#if PY_VERSION_HEX >= 0x03000000
                    PyBytes_FromStringAndSize(
                      reinterpret_cast<char *>(ptr), sizes[i])
#else
                    PyString_FromStringAndSize(
                      reinterpret_cast<char *>(ptr), sizes[i])
#endif
                    ));
                py_result.append(binary_pyobj);
                ptr += sizes[i];
              }
              return py_result;
            }
            // }}}
#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_PROGRAM_NUM_KERNELS:
            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name,
                size_t);
          case CL_PROGRAM_KERNEL_NAMES:
            PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
#endif
#if PYOPENCL_CL_VERSION >= 0x2010
          case CL_PROGRAM_IL:
            PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
#endif
#if PYOPENCL_CL_VERSION >= 0x2020
          case CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT:
          case CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT:
            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_bool);
#endif

          default:
            throw error("Program.get_info", CL_INVALID_VALUE);
        }
      }

      py::object get_build_info(
          device const &dev,
          cl_program_build_info param_name) const
      {
        switch (param_name)
        {
#define PYOPENCL_FIRST_ARG m_program, dev.data() // hackety hack
          case CL_PROGRAM_BUILD_STATUS:
            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                PYOPENCL_FIRST_ARG, param_name,
                cl_build_status);
          case CL_PROGRAM_BUILD_OPTIONS:
          case CL_PROGRAM_BUILD_LOG:
            PYOPENCL_GET_STR_INFO(ProgramBuild,
                PYOPENCL_FIRST_ARG, param_name);
#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_PROGRAM_BINARY_TYPE:
            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                PYOPENCL_FIRST_ARG, param_name,
                cl_program_binary_type);
#endif
#if PYOPENCL_CL_VERSION >= 0x2000
          case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE:
            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                PYOPENCL_FIRST_ARG, param_name,
                size_t);
#endif
#undef PYOPENCL_FIRST_ARG

          default:
            throw error("Program.get_build_info", CL_INVALID_VALUE);
        }
      }

      void build(py::bytes options, py::object py_devices)
      {
        PYOPENCL_PARSE_PY_DEVICES;

        PYOPENCL_CALL_GUARDED_THREADED(clBuildProgram,
            (m_program, num_devices, devices,
             options.c_str(), 0 ,0));
      }

#if PYOPENCL_CL_VERSION >= 0x1020
      void compile(py::bytes options, py::object py_devices,
          py::object py_headers)
      {
        PYOPENCL_PARSE_PY_DEVICES;

        // {{{ pick apart py_headers
        // py_headers is a list of tuples *(name, program)*

        std::vector<std::string> header_names;
        std::vector<cl_program> programs;
        for (py::handle name_hdr_tup_py: py_headers)
        {
          py::tuple name_hdr_tup = py::borrow<py::tuple>(name_hdr_tup_py);
          if (py::len(name_hdr_tup) != 2)
            throw error("Program.compile", CL_INVALID_VALUE,
                "epxected (name, header) tuple in headers list");
          std::string name = py::cast<std::string>(name_hdr_tup[0]);
          program &prg = py::cast<program &>(name_hdr_tup[1]);

          header_names.push_back(name);
          programs.push_back(prg.data());
        }

        std::vector<const char *> header_name_ptrs;
        for (std::string const &name: header_names)
          header_name_ptrs.push_back(name.c_str());

        // }}}

        PYOPENCL_CALL_GUARDED_THREADED(clCompileProgram,
            (m_program, num_devices, devices,
             options.c_str(), header_names.size(),
             programs.empty() ? nullptr : &programs.front(),
             header_name_ptrs.empty() ? nullptr : &header_name_ptrs.front(),
             0, 0));
      }
#endif

#if PYOPENCL_CL_VERSION >= 0x2020
      void set_specialization_constant(cl_uint spec_id, py::object py_buffer)
      {
        py_buffer_wrapper bufwrap;
        bufwrap.get(py_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
        PYOPENCL_CALL_GUARDED(clSetProgramSpecializationConstant,
            (m_program, spec_id, bufwrap.m_buf.len, bufwrap.m_buf.buf));
      }
#endif
  };




  inline
  void create_program_with_source(
      program *self,
      context &ctx,
      std::string const &src)
  {
    const char *string = src.c_str();
    size_t length = src.size();

    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithSource");
    cl_program result = clCreateProgramWithSource(
        ctx.data(), 1, &string, &length, &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateProgramWithSource", status_code);

    try
    {
      new (self) program(result, false, program::KND_SOURCE);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
  }





  inline
  void create_program_with_binary(
      program *self,
      context &ctx,
      py::sequence py_devices,
      py::sequence py_binaries)
  {
    std::vector<cl_device_id> devices;
    std::vector<const unsigned char *> binaries;
    std::vector<size_t> sizes;

    size_t num_devices = len(py_devices);
    if (len(py_binaries) != num_devices)
      throw error("create_program_with_binary", CL_INVALID_VALUE,
          "device and binary counts don't match");

    for (size_t i = 0; i < num_devices; ++i)
    {
      devices.push_back(py::cast<device const &>(py_devices[i]).data());
      const void *buf;
      PYOPENCL_BUFFER_SIZE_T len;

      py_buffer_wrapper buf_wrapper;

      buf_wrapper.get(py::object(py_binaries[i]).ptr(), PyBUF_ANY_CONTIGUOUS);

      buf = buf_wrapper.m_buf.buf;
      len = buf_wrapper.m_buf.len;

      binaries.push_back(reinterpret_cast<const unsigned char *>(buf));
      sizes.push_back(len);
    }

    PYOPENCL_STACK_CONTAINER(cl_int, binary_statuses, num_devices);

    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBinary");
    cl_program result = clCreateProgramWithBinary(
        ctx.data(), num_devices,
        devices.empty( ) ? nullptr : &devices.front(),
        sizes.empty( ) ? nullptr : &sizes.front(),
        binaries.empty( ) ? nullptr : &binaries.front(),
        PYOPENCL_STACK_CONTAINER_GET_PTR(binary_statuses),
        &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateProgramWithBinary", status_code);

    /*
    for (int i = 0; i < num_devices; ++i)
      printf("%d:%d\n", i, binary_statuses[i]);
      */

    try
    {
      new (self) program(result, false, program::KND_BINARY);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
  }



#if (PYOPENCL_CL_VERSION >= 0x1020) || \
      ((PYOPENCL_CL_VERSION >= 0x1030) && defined(__APPLE__))
  inline
  program *create_program_with_built_in_kernels(
      context &ctx,
      py::object py_devices,
      std::string const &kernel_names)
  {
    PYOPENCL_PARSE_PY_DEVICES;

    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBuiltInKernels");
    cl_program result = clCreateProgramWithBuiltInKernels(
        ctx.data(), num_devices, devices,
        kernel_names.c_str(), &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateProgramWithBuiltInKernels", status_code);

    try
    {
      return new program(result, false);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
  }
#endif



#if (PYOPENCL_CL_VERSION >= 0x2010)
  inline
  program *create_program_with_il(
      context &ctx,
      py::bytes const &src)
  {
    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithIL");
    cl_program result = clCreateProgramWithIL(
        ctx.data(), src.c_str(), src.size(), &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateProgramWithIL", status_code);

    try
    {
      return new program(result, false, program::KND_IL);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
  }
#endif





#if PYOPENCL_CL_VERSION >= 0x1020
  inline
  program *link_program(
      context &ctx,
      py::object py_programs,
      py::bytes options,
      py::object py_devices
      )
  {
    PYOPENCL_PARSE_PY_DEVICES;

    std::vector<cl_program> programs;
    for (py::handle py_prg: py_programs)
    {
      program &prg = py::cast<program &>(py_prg);
      programs.push_back(prg.data());
    }

    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clLinkProgram");
    cl_program result = clLinkProgram(
        ctx.data(), num_devices, devices,
        options.c_str(),
        programs.size(),
        programs.empty() ? nullptr : &programs.front(),
        0, 0,
        &status_code);

    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clLinkProgram", result, status_code);

    try
    {
      return new program(result, false);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
  }

#endif


#if PYOPENCL_CL_VERSION >= 0x1020
  inline
  void unload_platform_compiler(platform &plat)
  {
    PYOPENCL_CALL_GUARDED(clUnloadPlatformCompiler, (plat.data()));
  }
#endif

  // }}}


  // {{{ kernel
  class local_memory
  {
    private:
      size_t m_size;

    public:
      local_memory(size_t size)
        : m_size(size)
      { }

      size_t size() const
      { return m_size; }
  };




  class kernel : noncopyable
  {
    private:
      cl_kernel m_kernel;
      bool m_set_arg_prefer_svm;

    public:
      kernel(cl_kernel knl, bool retain)
        : m_kernel(knl), m_set_arg_prefer_svm(false)
      {
        if (retain)
          PYOPENCL_CALL_GUARDED(clRetainKernel, (knl));
      }

      kernel(program const &prg, std::string const &kernel_name)
        : m_set_arg_prefer_svm(false)
      {
        cl_int status_code;

        PYOPENCL_PRINT_CALL_TRACE("clCreateKernel");
        m_kernel = clCreateKernel(prg.data(), kernel_name.c_str(),
            &status_code);
        if (status_code != CL_SUCCESS)
          throw pyopencl::error("clCreateKernel", status_code);
      }

      ~kernel()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseKernel, (m_kernel));
      }

      cl_kernel data() const
      {
        return m_kernel;
      }

      PYOPENCL_EQUALITY_TESTS(kernel);

#if PYOPENCL_CL_VERSION >= 0x2010
      kernel *clone()
      {
        cl_int status_code;

        PYOPENCL_PRINT_CALL_TRACE("clCloneKernel");
        cl_kernel result = clCloneKernel(m_kernel, &status_code);
        if (status_code != CL_SUCCESS)
          throw pyopencl::error("clCloneKernel", status_code);

        try
        {
          return new kernel(result, /* retain */ false);
        }
        catch (...)
        {
          PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseKernel, (result));
          throw;
        }
      }
#endif

      void set_arg_null(cl_uint arg_index)
      {
        cl_mem m = 0;
        PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index,
              sizeof(cl_mem), &m));
      }

      void set_arg_mem(cl_uint arg_index, memory_object_holder &moh)
      {
        cl_mem m = moh.data();
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, sizeof(cl_mem), &m));
      }

      void set_arg_local(cl_uint arg_index, local_memory const &loc)
      {
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, loc.size(), 0));
      }

      void set_arg_sampler(cl_uint arg_index, sampler const &smp)
      {
        cl_sampler s = smp.data();
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, sizeof(cl_sampler), &s));
      }

      void set_arg_command_queue(cl_uint arg_index, command_queue const &queue)
      {
        cl_command_queue q = queue.data();
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, sizeof(cl_command_queue), &q));
      }

      void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj)
      {
        py::bytes typechar_str(py::cast<py::bytes>(py_typechar));
        if (typechar_str.size() != 1)
          throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE,
              "type char argument must have exactly one character");

        char typechar = *typechar_str.c_str();

#define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE, CAST_TYPE) \
        case TYPECH_VAL: \
          { \
            TYPE val = (TYPE) py::cast<CAST_TYPE>(obj); \
            PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \
            break; \
          }
        switch (typechar)
        {
          // FIXME: nanobind thinks of char as "short string", not number
          // The detour via 'int' may lose data.
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char, int)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char, int)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('B', unsigned char, int)

          PYOPENCL_KERNEL_PACK_AND_SET_ARG('h', short, short)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('H', unsigned short, unsigned short)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('i', int, int)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('I', unsigned int, unsigned int)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('l', long, long)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('L', unsigned long, unsigned long)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('f', float, float)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('d', double, double)
          default:
            throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE,
                "invalid type char");
        }
#undef PYOPENCL_KERNEL_PACK_AND_SET_ARG
      }

      void set_arg_buf(cl_uint arg_index, py::handle py_buffer)
      {
        const void *buf;
        PYOPENCL_BUFFER_SIZE_T len;

        py_buffer_wrapper buf_wrapper;

        try
        {
          buf_wrapper.get(py_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
        }
        catch (py::python_error &)
        {
          PyErr_Clear();
          throw error("Kernel.set_arg", CL_INVALID_VALUE,
              "invalid kernel argument");
        }

        buf = buf_wrapper.m_buf.buf;
        len = buf_wrapper.m_buf.len;

        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, len, buf));
      }

#if PYOPENCL_CL_VERSION >= 0x2000
      void set_arg_svm(cl_uint arg_index, svm_pointer const &wrp)
      {
        PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer,
            (m_kernel, arg_index, wrp.svm_ptr()));
      }
#endif

      void set_arg(cl_uint arg_index, py::handle arg)
      {
        if (arg.ptr() == Py_None)
        {
          set_arg_null(arg_index);
          return;
        }

        // It turns out that a taken 'catch' has a relatively high cost, so
        // in deciding which of "mem object" and "svm" to try first, we use
        // whatever we were given last time around.
        if (m_set_arg_prefer_svm)
        {
#if PYOPENCL_CL_VERSION >= 0x2000
          try
          {
            set_arg_svm(arg_index, py::cast<svm_pointer const &>(arg));
            return;
          }
          catch (py::cast_error &) { }
#endif

          try
          {
            set_arg_mem(arg_index, py::cast<memory_object_holder &>(arg));
            m_set_arg_prefer_svm = false;
            return;
          }
          catch (py::cast_error &) { }
        }
        else
        {
          try
          {
            set_arg_mem(arg_index, py::cast<memory_object_holder &>(arg));
            return;
          }
          catch (py::cast_error &) { }

#if PYOPENCL_CL_VERSION >= 0x2000
          try
          {
            set_arg_svm(arg_index, py::cast<svm_pointer const &>(arg));
            m_set_arg_prefer_svm = true;
            return;
          }
          catch (py::cast_error &) { }
#endif
        }

        try
        {
          set_arg_local(arg_index, py::cast<local_memory>(arg));
          return;
        }
        catch (py::cast_error &) { }

        try
        {
          set_arg_sampler(arg_index, py::cast<const sampler &>(arg));
          return;
        }
        catch (py::cast_error &) { }

        try
        {
          set_arg_command_queue(arg_index, py::cast<const command_queue &>(arg));
          return;
        }
        catch (py::cast_error &) { }

        set_arg_buf(arg_index, arg);
      }

      py::object get_info(cl_kernel_info param_name) const
      {
        switch (param_name)
        {
          case CL_KERNEL_FUNCTION_NAME:
            PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
          case CL_KERNEL_NUM_ARGS:
          case CL_KERNEL_REFERENCE_COUNT:
            PYOPENCL_GET_TYPED_INFO(Kernel, m_kernel, param_name,
                cl_uint);
          case CL_KERNEL_CONTEXT:
            PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
                cl_context, context);
          case CL_KERNEL_PROGRAM:
            PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
                cl_program, program);
#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_KERNEL_ATTRIBUTES:
            PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
#endif
          default:
            throw error("Kernel.get_info", CL_INVALID_VALUE);
        }
      }

      py::object get_work_group_info(
          cl_kernel_work_group_info param_name,
          device const &dev
          ) const
      {
        switch (param_name)
        {
#define PYOPENCL_FIRST_ARG m_kernel, dev.data() // hackety hack
          case CL_KERNEL_WORK_GROUP_SIZE:
            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                PYOPENCL_FIRST_ARG, param_name,
                size_t);
          case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
            {
              std::vector<size_t> result;
              PYOPENCL_GET_VEC_INFO(KernelWorkGroup,
                  PYOPENCL_FIRST_ARG, param_name, result);

              PYOPENCL_RETURN_VECTOR(size_t, result);
            }
          case CL_KERNEL_LOCAL_MEM_SIZE:
#if PYOPENCL_CL_VERSION >= 0x1010
          case CL_KERNEL_PRIVATE_MEM_SIZE:
#endif
            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                PYOPENCL_FIRST_ARG, param_name,
                cl_ulong);

#if PYOPENCL_CL_VERSION >= 0x1010
          case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                PYOPENCL_FIRST_ARG, param_name,
                size_t);
#endif
          default:
            throw error("Kernel.get_work_group_info", CL_INVALID_VALUE);
#undef PYOPENCL_FIRST_ARG
        }
      }

#if PYOPENCL_CL_VERSION >= 0x1020
      py::object get_arg_info(
          cl_uint arg_index,
          cl_kernel_arg_info param_name
          ) const
      {
        switch (param_name)
        {
#define PYOPENCL_FIRST_ARG m_kernel, arg_index // hackety hack
          case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
            PYOPENCL_GET_TYPED_INFO(KernelArg,
                PYOPENCL_FIRST_ARG, param_name,
                cl_kernel_arg_address_qualifier);

          case CL_KERNEL_ARG_ACCESS_QUALIFIER:
            PYOPENCL_GET_TYPED_INFO(KernelArg,
                PYOPENCL_FIRST_ARG, param_name,
                cl_kernel_arg_access_qualifier);

          case CL_KERNEL_ARG_TYPE_NAME:
          case CL_KERNEL_ARG_NAME:
            PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name);

          case CL_KERNEL_ARG_TYPE_QUALIFIER:
            PYOPENCL_GET_TYPED_INFO(KernelArg,
                PYOPENCL_FIRST_ARG, param_name,
                cl_kernel_arg_type_qualifier);
#undef PYOPENCL_FIRST_ARG
          default:
            throw error("Kernel.get_arg_info", CL_INVALID_VALUE);
        }
      }
#endif

#if PYOPENCL_CL_VERSION >= 0x2010
    py::object get_sub_group_info(
        device const &dev,
        cl_kernel_sub_group_info param_name,
        py::object py_input_value)
    {
      switch (param_name)
      {
        // size_t * -> size_t
        case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE:
        case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE:
          {
            std::vector<size_t> input_value;
            COPY_PY_LIST(size_t, input_value);

            size_t param_value;
            PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo,
                (m_kernel, dev.data(), param_name,
                 input_value.size()*sizeof(input_value.front()),
                 input_value.empty() ? nullptr : &input_value.front(),
                 sizeof(param_value), &param_value, 0));

            return py::cast(param_value);
          }

        // size_t -> size_t[]
        case CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT:
          {
            size_t input_value = py::cast<size_t>(py_input_value);

            std::vector<size_t> result;
            size_t size;
            PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo,
                (m_kernel, dev.data(), param_name,
                 sizeof(input_value), &input_value,
                 0, nullptr, &size));
            result.resize(size / sizeof(result.front()));
            PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo,
                (m_kernel, dev.data(), param_name,
                 sizeof(input_value), &input_value,
                 size, result.empty() ? nullptr : &result.front(), 0));

            PYOPENCL_RETURN_VECTOR(size_t, result);
          }

        // () -> size_t
        case CL_KERNEL_MAX_NUM_SUB_GROUPS:
        case CL_KERNEL_COMPILE_NUM_SUB_GROUPS:
          {
            size_t param_value;
            PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo,
                (m_kernel, dev.data(), param_name,
                 0, nullptr,
                 sizeof(param_value), &param_value, 0));

            return py::cast(param_value);
          }

        default:
          throw error("Kernel.get_sub_group_info", CL_INVALID_VALUE);
      }
  }
#endif
  };

#define PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER \
    catch (error &err) \
    { \
      std::string msg( \
          std::string("when processing arg#") + std::to_string(arg_index+1) \
          + std::string(" (1-based): ") + std::string(err.what())); \
      \
      auto mod_cl_ary(py::module_::import_("pyopencl.array")); \
      auto cls_array(mod_cl_ary.attr("Array")); \
      int isinstance_result = PyObject_IsInstance(arg_value.ptr(), cls_array.ptr()); \
      if (isinstance_result == -1) \
        throw py::python_error(); \
      \
      if (arg_value.ptr() && isinstance_result) \
        msg.append( \
            " (perhaps you meant to pass 'array.data' instead of the array itself?)"); \
      throw error(err.routine().c_str(), err.code(), msg.c_str()); \
    } \
    catch (std::exception &err) \
    { \
      std::string msg( \
          std::string("when processing arg#") + std::to_string(arg_index+1) \
          + std::string(" (1-based): ") + std::string(err.what())); \
      throw std::runtime_error(msg.c_str()); \
    }

  inline
  void set_arg_multi(
      std::function<void(cl_uint, py::handle)> set_arg_func,
      py::tuple args_and_indices)
  {
    cl_uint arg_index;
    py::handle arg_value;

    auto it = args_and_indices.begin(), end = args_and_indices.end();
    try
    {
      /* This is an internal interface that assumes it gets fed well-formed
       * data.  No meaningful error checking is being performed on
       * off-interval exhaustion of the iterator, on purpose.
       */
      while (it != end)
      {
        // special value in case integer cast fails
        arg_index = 9999 - 1;

        arg_index = py::cast<cl_uint>(*it++);
        arg_value = *it++;
        set_arg_func(arg_index, arg_value);
      }
    }
    PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER
  }


  inline
  void set_arg_multi(
      std::function<void(cl_uint, py::handle, py::handle)> set_arg_func,
      py::tuple args_and_indices)
  {
    cl_uint arg_index;
    py::handle arg_descr, arg_value;

    auto it = args_and_indices.begin(), end = args_and_indices.end();
    try
    {
      /* This is an internal interface that assumes it gets fed well-formed
       * data.  No meaningful error checking is being performed on
       * off-interval exhaustion of the iterator, on purpose.
       */
      while (it != end)
      {
        // special value in case integer cast fails
        arg_index = 9999 - 1;

        arg_index = py::cast<cl_uint>(*it++);
        arg_descr = *it++;
        arg_value = *it++;
        set_arg_func(arg_index, arg_descr, arg_value);
      }
    }
    PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER
  }


  inline
  py::list create_kernels_in_program(program &pgm)
  {
    cl_uint num_kernels;
    PYOPENCL_CALL_GUARDED(clCreateKernelsInProgram, (
          pgm.data(), 0, 0, &num_kernels));

    std::vector<cl_kernel> kernels(num_kernels);
    PYOPENCL_CALL_GUARDED(clCreateKernelsInProgram, (
          pgm.data(), num_kernels,
          kernels.empty( ) ? nullptr : &kernels.front(), &num_kernels));

    py::list result;
    for (cl_kernel knl: kernels)
      result.append(handle_from_new_ptr(new kernel(knl, true)));

    return result;
  }

#define MAX_WS_DIM_COUNT 10

  inline
  event *enqueue_nd_range_kernel(
      command_queue &cq,
      kernel &knl,
      py::handle py_global_work_size,
      py::handle py_local_work_size,
      py::handle py_global_work_offset,
      py::handle py_wait_for,
      bool g_times_l,
      bool allow_empty_ndrange)
  {
    PYOPENCL_PARSE_WAIT_FOR;

    std::array<size_t, MAX_WS_DIM_COUNT> global_work_size;
    unsigned gws_size = 0;
    COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_size);
    cl_uint work_dim = gws_size;

    std::array<size_t, MAX_WS_DIM_COUNT> local_work_size;
    unsigned lws_size = 0;
    size_t *local_work_size_ptr = nullptr;

    if (py_local_work_size.ptr() != Py_None)
    {
      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_size);

      if (g_times_l)
        work_dim = std::max(work_dim, lws_size);
      else
        if (work_dim != lws_size)
          throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
              "global/local work sizes have differing dimensions");

      while (lws_size < work_dim)
        local_work_size[lws_size++] = 1;
      while (gws_size < work_dim)
        global_work_size[gws_size++] = 1;

      local_work_size_ptr = &local_work_size.front();
    }

    if (g_times_l && lws_size)
    {
      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
        global_work_size[work_axis] *= local_work_size[work_axis];
    }

    size_t *global_work_offset_ptr = nullptr;
    std::array<size_t, MAX_WS_DIM_COUNT> global_work_offset;
    if (py_global_work_offset.ptr() != Py_None)
    {
      unsigned gwo_size = 0;
      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_size);

      if (work_dim != gwo_size)
        throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
            "global work size and offset have differing dimensions");

      if (g_times_l && local_work_size_ptr)
      {
        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
          global_work_offset[work_axis] *= local_work_size[work_axis];
      }

      global_work_offset_ptr = &global_work_offset.front();
    }

    if (allow_empty_ndrange)
    {
#if PYOPENCL_CL_VERSION >= 0x1020
      bool is_empty = false;
      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
        if (global_work_size[work_axis] == 0)
          is_empty = true;
      if (local_work_size_ptr)
        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
          if (local_work_size_ptr[work_axis] == 0)
            is_empty = true;

      if (is_empty)
      {
        cl_event evt;
        PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, (
              cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));
        PYOPENCL_RETURN_NEW_EVENT(evt);
      }
#else
      // clEnqueueWaitForEvents + clEnqueueMarker is not equivalent
      // in the case of an out-of-order queue.
      throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
          "allow_empty_ndrange requires OpenCL 1.2");
#endif
    }

    PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( {
          cl_event evt;
          PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, (
                cq.data(),
                knl.data(),
                work_dim,
                global_work_offset_ptr,
                &global_work_size.front(),
                local_work_size_ptr,
                PYOPENCL_WAITLIST_ARGS, &evt
                ));
          PYOPENCL_RETURN_NEW_EVENT(evt);
        } );
  }

  // }}}


  // {{{ gl interop
  inline
  bool have_gl()
  {
#ifdef HAVE_GL
    return true;
#else
    return false;
#endif
  }




#ifdef HAVE_GL

#ifdef __APPLE__
  inline
  cl_context_properties get_apple_cgl_share_group()
  {
    CGLContextObj kCGLContext = CGLGetCurrentContext();
    CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);

    return (cl_context_properties) kCGLShareGroup;
  }
#endif /* __APPLE__ */




  class gl_buffer : public memory_object
  {
    public:
      gl_buffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
        : memory_object(mem, retain, std::move(hostbuf))
      { }
  };




  class gl_renderbuffer : public memory_object
  {
    public:
      gl_renderbuffer(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
        : memory_object(mem, retain, std::move(hostbuf))
      { }
  };




  class gl_texture : public image
  {
    public:
      gl_texture(cl_mem mem, bool retain, hostbuf_t hostbuf=hostbuf_t())
        : image(mem, retain, std::move(hostbuf))
      { }

      py::object get_gl_texture_info(cl_gl_texture_info param_name)
      {
        switch (param_name)
        {
          case CL_GL_TEXTURE_TARGET:
            PYOPENCL_GET_TYPED_INFO(GLTexture, data(), param_name, GLenum);
          case CL_GL_MIPMAP_LEVEL:
            PYOPENCL_GET_TYPED_INFO(GLTexture, data(), param_name, GLint);

          default:
            throw error("MemoryObject.get_gl_texture_info", CL_INVALID_VALUE);
        }
      }
  };




#define PYOPENCL_WRAP_BUFFER_CREATOR(TYPE, NAME, CL_NAME, ARGS, CL_ARGS) \
  inline \
  void NAME ARGS \
  { \
    cl_int status_code; \
    PYOPENCL_PRINT_CALL_TRACE(#CL_NAME); \
    cl_mem mem = CL_NAME CL_ARGS; \
    \
    if (status_code != CL_SUCCESS) \
      throw pyopencl::error(#CL_NAME, status_code); \
    \
    try \
    { \
      new (self) TYPE(mem, false); \
    } \
    catch (...) \
    { \
      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem)); \
      throw; \
    } \
  }




  PYOPENCL_WRAP_BUFFER_CREATOR(gl_buffer,
      create_from_gl_buffer, clCreateFromGLBuffer,
      (gl_buffer *self, context &ctx, cl_mem_flags flags, GLuint bufobj),
      (ctx.data(), flags, bufobj, &status_code));
  PYOPENCL_WRAP_BUFFER_CREATOR(gl_texture,
      create_from_gl_texture_2d, clCreateFromGLTexture2D,
      (gl_texture *self, context &ctx, cl_mem_flags flags,
         GLenum texture_target, GLint miplevel, GLuint texture),
      (ctx.data(), flags, texture_target, miplevel, texture, &status_code));
  PYOPENCL_WRAP_BUFFER_CREATOR(gl_texture,
      create_from_gl_texture_3d, clCreateFromGLTexture3D,
      (gl_texture *self, context &ctx, cl_mem_flags flags,
         GLenum texture_target, GLint miplevel, GLuint texture),
      (ctx.data(), flags, texture_target, miplevel, texture, &status_code));
  PYOPENCL_WRAP_BUFFER_CREATOR(gl_renderbuffer,
      create_from_gl_renderbuffer, clCreateFromGLRenderbuffer,
      (gl_renderbuffer *self, context &ctx, cl_mem_flags flags, GLuint renderbuffer),
      (ctx.data(), flags, renderbuffer, &status_code));

  inline
  void create_from_gl_texture(
      gl_texture *self,
      context &ctx, cl_mem_flags flags,
      GLenum texture_target, GLint miplevel,
      GLuint texture, unsigned dims)
  {
    if (dims == 2)
      return create_from_gl_texture_2d(self, ctx, flags, texture_target, miplevel, texture);
    else if (dims == 3)
      return create_from_gl_texture_3d(self, ctx, flags, texture_target, miplevel, texture);
    else
      throw pyopencl::error("Image", CL_INVALID_VALUE,
          "invalid dimension");
  }





  inline
  py::tuple get_gl_object_info(memory_object_holder const &mem)
  {
    cl_gl_object_type otype;
    GLuint gl_name;
    PYOPENCL_CALL_GUARDED(clGetGLObjectInfo, (mem.data(), &otype, &gl_name));
    return py::make_tuple(otype, gl_name);
  }

#define WRAP_GL_ENQUEUE(what, What) \
  inline \
  event *enqueue_##what##_gl_objects( \
      command_queue &cq, \
      py::object py_mem_objects, \
      py::object py_wait_for) \
  { \
    PYOPENCL_PARSE_WAIT_FOR; \
    \
    std::vector<cl_mem> mem_objects; \
    for (py::handle mo: py_mem_objects) \
      mem_objects.push_back(py::cast<memory_object_holder &>(mo).data()); \
    \
    cl_event evt; \
    PYOPENCL_CALL_GUARDED(clEnqueue##What##GLObjects, ( \
          cq.data(), \
          mem_objects.size(), mem_objects.empty( ) ? nullptr : &mem_objects.front(), \
          PYOPENCL_WAITLIST_ARGS, &evt \
          )); \
    \
    PYOPENCL_RETURN_NEW_EVENT(evt); \
  }

  WRAP_GL_ENQUEUE(acquire, Acquire);
  WRAP_GL_ENQUEUE(release, Release);
#endif




#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
  inline
  py::object get_gl_context_info_khr(
      py::object py_properties,
      cl_gl_context_info param_name,
      py::object py_platform
      )
  {
    std::vector<cl_context_properties> props
      = parse_context_properties(py_properties);

    typedef CL_API_ENTRY cl_int (CL_API_CALL
      *func_ptr_type)(const cl_context_properties * /* properties */,
          cl_gl_context_info            /* param_name */,
          size_t                        /* param_value_size */,
          void *                        /* param_value */,
          size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;

    func_ptr_type func_ptr;

#if PYOPENCL_CL_VERSION >= 0x1020
    if (py_platform.ptr() != Py_None)
    {
      platform &plat = py::cast<platform &>(py_platform);

      func_ptr = (func_ptr_type) clGetExtensionFunctionAddressForPlatform(
            plat.data(), "clGetGLContextInfoKHR");
    }
    else
    {
      PYOPENCL_DEPRECATED("get_gl_context_info_khr with platform=None", "2013.1", );

      func_ptr = (func_ptr_type) clGetExtensionFunctionAddress(
            "clGetGLContextInfoKHR");
    }
#else
    func_ptr = (func_ptr_type) clGetExtensionFunctionAddress(
          "clGetGLContextInfoKHR");
#endif


    if (!func_ptr)
      throw error("Context.get_info", CL_INVALID_PLATFORM,
          "clGetGLContextInfoKHR extension function not present");

    cl_context_properties *props_ptr
      = props.empty( ) ? nullptr : &props.front();

    switch (param_name)
    {
      case CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR:
        {
          cl_device_id param_value;
          PYOPENCL_CALL_GUARDED(func_ptr,
              (props_ptr, param_name, sizeof(param_value), &param_value, 0));
          return py::object(handle_from_new_ptr( \
                new device(param_value, /*retain*/ true)));
        }

      case CL_DEVICES_FOR_GL_CONTEXT_KHR:
        {
          size_t size;
          PYOPENCL_CALL_GUARDED(func_ptr,
              (props_ptr, param_name, 0, 0, &size));

          std::vector<cl_device_id> devices;

          devices.resize(size / sizeof(devices.front()));

          PYOPENCL_CALL_GUARDED(func_ptr,
              (props_ptr, param_name, size,
               devices.empty( ) ? nullptr : &devices.front(), &size));

          py::list result;
          for (cl_device_id did: devices)
            result.append(handle_from_new_ptr(
                  new device(did)));

          return result;
        }

      default:
        throw error("get_gl_context_info_khr", CL_INVALID_VALUE);
    }
  }

#endif

  // }}}


  // {{{ deferred implementation bits

#if PYOPENCL_CL_VERSION >= 0x2010
  inline void context::set_default_device_command_queue(device const &dev, command_queue const &queue)
  {
    PYOPENCL_CALL_GUARDED(clSetDefaultDeviceCommandQueue,
        (m_context, dev.data(), queue.data()));
  }
#endif


  inline program *error::get_program() const
  {
    return new program(m_program, /* retain */ true);
  }

  inline py::object create_mem_object_wrapper(cl_mem mem, bool retain=true)
  {
    cl_mem_object_type mem_obj_type;
    PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, \
        (mem, CL_MEM_TYPE, sizeof(mem_obj_type), &mem_obj_type, 0));

    switch (mem_obj_type)
    {
      case CL_MEM_OBJECT_BUFFER:
        return py::object(handle_from_new_ptr(
              new buffer(mem, retain)));
      case CL_MEM_OBJECT_IMAGE2D:
      case CL_MEM_OBJECT_IMAGE3D:
#if PYOPENCL_CL_VERSION >= 0x1020
      case CL_MEM_OBJECT_IMAGE2D_ARRAY:
      case CL_MEM_OBJECT_IMAGE1D:
      case CL_MEM_OBJECT_IMAGE1D_ARRAY:
      case CL_MEM_OBJECT_IMAGE1D_BUFFER:
#endif
        return py::object(handle_from_new_ptr(
              new image(mem, retain)));
      default:
        return py::object(handle_from_new_ptr(
              new memory_object(mem, retain)));
    }
  }

  inline
  py::object memory_object_from_int(intptr_t cl_mem_as_int, bool retain)
  {
    return create_mem_object_wrapper((cl_mem) cl_mem_as_int, retain);
  }


  inline
  py::object memory_object_holder::get_info(cl_mem_info param_name) const
  {
    switch (param_name)
    {
      case CL_MEM_TYPE:
        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
            cl_mem_object_type);
      case CL_MEM_FLAGS:
        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
            cl_mem_flags);
      case CL_MEM_SIZE:
        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
            size_t);
      case CL_MEM_HOST_PTR:
        throw pyopencl::error("MemoryObject.get_info", CL_INVALID_VALUE,
            "Use MemoryObject.get_host_array to get host pointer.");
      case CL_MEM_MAP_COUNT:
        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
            cl_uint);
      case CL_MEM_REFERENCE_COUNT:
        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
            cl_uint);
      case CL_MEM_CONTEXT:
        PYOPENCL_GET_OPAQUE_INFO(MemObject, data(), param_name,
            cl_context, context);

#if PYOPENCL_CL_VERSION >= 0x1010
      case CL_MEM_ASSOCIATED_MEMOBJECT:
        {
          cl_mem param_value;
          PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, \
              (data(), param_name, sizeof(param_value), &param_value, 0));
          if (param_value == 0)
          {
            // no associated memory object? no problem.
            return py::none();
          }

          return create_mem_object_wrapper(param_value);
        }
      case CL_MEM_OFFSET:
        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
            size_t);
#endif
#if PYOPENCL_CL_VERSION >= 0x2000
      case CL_MEM_USES_SVM_POINTER:
        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
            cl_bool);
#endif
#if PYOPENCL_CL_VERSION >= 0x3000
      case CL_MEM_PROPERTIES:
            {
              std::vector<cl_mem_properties> result;
              PYOPENCL_GET_VEC_INFO(MemObject, data(), param_name, result);
              PYOPENCL_RETURN_VECTOR(cl_mem_properties, result);
            }
#endif

      default:
        throw error("MemoryObjectHolder.get_info", CL_INVALID_VALUE);
    }
  }

  // FIXME: Reenable in pypy
#ifndef PYPY_VERSION
  inline
  py::object get_mem_obj_host_array(
      py::object mem_obj_py,
      py::object shape, py::object dtype,
      py::object order_py)
  {
    memory_object_holder const &mem_obj =
      py::cast<memory_object_holder const &>(mem_obj_py);
    PyArray_Descr *tp_descr;
    if (PyArray_DescrConverter(dtype.ptr(), &tp_descr) != NPY_SUCCEED)
      throw py::python_error();
    cl_mem_flags mem_flags;
    PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
            (mem_obj.data(), CL_MEM_FLAGS, sizeof(mem_flags), &mem_flags, 0));
    if (!(mem_flags & CL_MEM_USE_HOST_PTR))
      throw pyopencl::error("MemoryObject.get_host_array", CL_INVALID_VALUE,
                            "Only MemoryObject with USE_HOST_PTR "
                            "is supported.");

    std::vector<npy_intp> dims;
    try
    {
      dims.push_back(py::cast<npy_intp>(shape));
    }
    catch (py::cast_error &)
    {
      for (auto it: shape)
        dims.push_back(py::cast<npy_intp>(it));
    }

    NPY_ORDER order = NPY_CORDER;
    PyArray_OrderConverter(order_py.ptr(), &order);

    int ary_flags = 0;
    if (order == NPY_FORTRANORDER)
      ary_flags |= NPY_FARRAY;
    else if (order == NPY_CORDER)
      ary_flags |= NPY_CARRAY;
    else
      throw std::runtime_error("unrecognized order specifier");

    void *host_ptr;
    size_t mem_obj_size;
    PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
        (mem_obj.data(), CL_MEM_HOST_PTR, sizeof(host_ptr),
         &host_ptr, 0));
    PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
        (mem_obj.data(), CL_MEM_SIZE, sizeof(mem_obj_size),
         &mem_obj_size, 0));

    py::object result = py::steal<py::object>(PyArray_NewFromDescr(
        &PyArray_Type, tp_descr,
        dims.size(), &dims.front(), /*strides*/ nullptr,
        host_ptr, ary_flags, /*obj*/nullptr));
    PyArrayObject *result_arr = (PyArrayObject *) result.ptr();

    if ((size_t) PyArray_NBYTES(result_arr) > mem_obj_size)
      throw pyopencl::error("MemoryObject.get_host_array",
          CL_INVALID_VALUE,
          "Resulting array is larger than memory object.");

    PyArray_SetBaseObject(result_arr, mem_obj_py.ptr());
    Py_INCREF(mem_obj_py.ptr());

    return result;
  }
#endif

  // }}}
}

#endif

// vim: foldmethod=marker