        return result;
      }
#endif
  inline
  function module::get_function(const char *name)
  {
    CUfunction func;
    CUDAPP_CALL_GUARDED(cuModuleGetFunction, (&func, m_module, name));
    return function(func, name);
  }
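
  // Usage sketch (hypothetical: a module `mod` loaded elsewhere, wrapping a
  // kernel compiled with extern "C" so its name is unmangled):
  //
  //   function f = mod.get_function("my_kernel");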




  // device memory ------------------------------------------------------------
  inline
  py::tuple mem_get_info()
  {
    unsigned int free, total;
    CUDAPP_CALL_GUARDED(cuMemGetInfo, (&free, &total));
    return py::make_tuple(free, total);
  }
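  // Note: cuMemGetInfo reports free and total memory in bytes for the device
  // backing the current context.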

  inline 
  CUdeviceptr mem_alloc(unsigned long bytes)
  {
    CUdeviceptr devptr;
    CUDAPP_CALL_GUARDED(cuMemAlloc, (&devptr, bytes));
    return devptr;
  }

  inline 
  void mem_free(CUdeviceptr devptr)
  {
    CUDAPP_CALL_GUARDED_CLEANUP(cuMemFree, (devptr));
  }

  class device_allocation : public boost::noncopyable, public context_dependent
  {
    private:
      bool m_valid;

    protected:
      CUdeviceptr m_devptr;

    public:
      device_allocation(CUdeviceptr devptr)
        : m_valid(true), m_devptr(devptr)
      { }

      void free()
      {
        if (m_valid)
        {
          try
          {
            scoped_context_activation ca(get_context());
            mem_free(m_devptr);
          }
          CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(device_allocation);

          release_context();
          m_valid = false;
        }
        else
          throw cuda::error("device_allocation::free", CUDA_ERROR_INVALID_HANDLE);
      }

      ~device_allocation()
      {
        if (m_valid)
          free();
      }

      operator CUdeviceptr() const
      { return m_devptr; }
  };

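  // A minimal usage sketch (assuming an active context): RAII ownership of a
  // device buffer; the implicit CUdeviceptr conversion lets the allocation be
  // passed straight to driver-API calls.
  //
  //   device_allocation da(mem_alloc(1024));
  //   CUDAPP_CALL_GUARDED(cuMemsetD8, (da, 0, 1024));
  //   // freed automatically when da goes out of scope
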
  inline unsigned int mem_alloc_pitch(
      std::auto_ptr<device_allocation> &da,
      unsigned int width, unsigned int height, unsigned int access_size)
  {
    CUdeviceptr devptr;
    unsigned int pitch;
    CUDAPP_CALL_GUARDED(cuMemAllocPitch, (&devptr, &pitch, width, height, access_size));
    da = std::auto_ptr<device_allocation>(new device_allocation(devptr));
    return pitch;
  }

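  // Note: the returned pitch is the allocated row width in bytes (>= width),
  // padded so rows satisfy the alignment implied by access_size; element
  // (x, y) of the 2D region lives at devptr + y*pitch + x_in_bytes.
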
  inline
  py::tuple mem_get_address_range(CUdeviceptr ptr)
  {
    CUdeviceptr base;
    unsigned int size;
    CUDAPP_CALL_GUARDED(cuMemGetAddressRange, (&base, &size, ptr));
    return py::make_tuple(base, size);
  }
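  // Note: given any pointer into an allocation made by cuMemAlloc, this
  // reports the base address and total size of that allocation.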

  // missing: htoa, atoh, dtoh, htod
  inline
  void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len)
  { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoA, (ary.handle(), index, src, len)); }

  inline
  void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len)
  { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoD, (dst, ary.handle(), index, len)); }
  inline
  void memcpy_atoa(
      array const &dst, unsigned int dst_index, 
      array const &src, unsigned int src_index, 
      unsigned int len)
  { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoA, (dst.handle(), dst_index, src.handle(), src_index, len)); }
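
  // Note: the index arguments above are byte offsets into the CUDA array, not
  // element indices, mirroring the dstOffset/srcOffset parameters of the
  // underlying driver calls.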




  // structured memcpy --------------------------------------------------------
#if PY_VERSION_HEX >= 0x02050000
  typedef Py_ssize_t PYCUDA_BUFFER_SIZE_T;
#else
  typedef int PYCUDA_BUFFER_SIZE_T;
#endif
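  // Python 2.5 changed the buffer protocol's length type from int to
  // Py_ssize_t (PEP 353); this typedef absorbs the difference.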

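  // Shared source/destination setters, spliced textually into the memcpy_2d
  // and memcpy_3d wrappers below; each setter records the memory type and
  // fills the matching CUDA_MEMCPY2D/CUDA_MEMCPY3D member.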
#define MEMCPY_SETTERS \
    void set_src_host(py::object buf_py) \
    { \
      srcMemoryType = CU_MEMORYTYPE_HOST; \
      PYCUDA_BUFFER_SIZE_T len; \
      if (PyObject_AsReadBuffer(buf_py.ptr(), &srcHost, &len)) \
        throw py::error_already_set(); \
    } \
    \
    void set_src_array(array const &ary)  \
    {  \
      srcMemoryType = CU_MEMORYTYPE_ARRAY; \
      srcArray = ary.handle();  \
    } \
    \
    void set_src_device(CUdeviceptr devptr)  \
    { \
      srcMemoryType = CU_MEMORYTYPE_DEVICE; \
      srcDevice = devptr; \
    } \
    \
    void set_dst_host(py::object buf_py) \
    { \
      dstMemoryType = CU_MEMORYTYPE_HOST; \
      PYCUDA_BUFFER_SIZE_T len; \
      if (PyObject_AsWriteBuffer(buf_py.ptr(), &dstHost, &len)) \
        throw py::error_already_set(); \
    } \
    \
    void set_dst_array(array const &ary) \
    { \
      dstMemoryType = CU_MEMORYTYPE_ARRAY; \
      dstArray = ary.handle(); \
    } \
    \
    void set_dst_device(CUdeviceptr devptr)  \
    { \
      dstMemoryType = CU_MEMORYTYPE_DEVICE; \
      dstDevice = devptr; \
    }





  struct memcpy_2d : public CUDA_MEMCPY2D
  {
    memcpy_2d()
    {
      srcXInBytes = 0;
      srcY = 0;

      dstXInBytes = 0;
      dstY = 0;
    }

    MEMCPY_SETTERS;

    void execute(bool aligned) const
    {
      if (aligned)
      { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2D, (this)); }
      else
      { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DUnaligned, (this)); }
    }

    void execute_async(const stream &s) const
    { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DAsync, (this, s.handle())); }
  };
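
  // A minimal sketch (hypothetical names; assumes an active context, a Python
  // buffer object buf, and a pitched device allocation devptr/pitch from
  // mem_alloc_pitch): copying a w x h byte image host-to-device.
  //
  //   memcpy_2d cpy;
  //   cpy.set_src_host(buf);
  //   cpy.srcPitch = w;
  //   cpy.set_dst_device(devptr);
  //   cpy.dstPitch = pitch;
  //   cpy.WidthInBytes = w;
  //   cpy.Height = h;
  //   cpy.execute(false);  // false selects the unaligned (always-safe) path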

#if CUDA_VERSION >= 2000
  struct memcpy_3d : public CUDA_MEMCPY3D
  {
    memcpy_3d()
    {
      reserved0 = 0;
      reserved1 = 0;

      srcXInBytes = 0;
      srcY = 0;
      srcZ = 0;
      srcLOD = 0;

      dstXInBytes = 0;
      dstY = 0;
      dstZ = 0;

      dstLOD = 0;
    }

    MEMCPY_SETTERS;

    void execute() const
    { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3D, (this)); }

    void execute_async(const stream &s) const
    { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DAsync, (this, s.handle())); }
  };
#endif


  // host memory --------------------------------------------------------------
  inline void *mem_alloc_host(unsigned int size, unsigned flags=0)
  {
    void *m_data;
#if CUDA_VERSION >= 2020
    CUDAPP_CALL_GUARDED(cuMemHostAlloc, (&m_data, size, flags));
#else
    if (flags != 0)
      throw cuda::error("mem_alloc_host", CUDA_ERROR_INVALID_VALUE,
          "nonzero flags in mem_alloc_host not allowed in CUDA 2.1 and older");
    CUDAPP_CALL_GUARDED(cuMemAllocHost, (&m_data, size));
#endif
    return m_data;
  }
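  // Note: page-locked (pinned) host memory is required for asynchronous
  // copies and transfers faster than pageable memory.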

  inline void mem_free_host(void *ptr)
  {
    CUDAPP_CALL_GUARDED_CLEANUP(cuMemFreeHost, (ptr));
  }




  struct host_allocation : public boost::noncopyable, public context_dependent
  {
    private:
      bool m_valid;

    protected:
      void *m_data;

    public:
      host_allocation(unsigned bytesize, unsigned flags=0)
        : m_valid(true), m_data(mem_alloc_host(bytesize, flags))
      { }

      ~host_allocation()
      {
        if (m_valid)
          free();
      }

      void free()
      {
        if (m_valid)
        {
          try
          {
            scoped_context_activation ca(get_context());
            mem_free_host(m_data);
          }
          CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(host_allocation);

          release_context();
          m_valid = false;
        }
        else
          throw cuda::error("host_allocation::free", CUDA_ERROR_INVALID_HANDLE);
      }

      void *data()
      { return m_data; }

#if CUDA_VERSION >= 2020
      CUdeviceptr get_device_pointer()
      {
        CUdeviceptr result;
        CUDAPP_CALL_GUARDED(cuMemHostGetDevicePointer, (&result, m_data, 0));
        return result;
      }
#endif

  };
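
  // A minimal sketch (CUDA >= 2.2; assumes the context was created with
  // CU_CTX_MAP_HOST): pinned memory mapped into the device address space, so
  // one buffer serves both sides of a zero-copy access.
  //
  //   host_allocation ha(bytes, CU_MEMHOSTALLOC_DEVICEMAP);
  //   CUdeviceptr dp = ha.get_device_pointer();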




  // events -------------------------------------------------------------------
  class event : public boost::noncopyable, public context_dependent
  {
    private:
      CUevent m_event;

    public:
      event(unsigned int flags=0)
      { CUDAPP_CALL_GUARDED(cuEventCreate, (&m_event, flags)); }

      ~event()
      {
        try
        {
          scoped_context_activation ca(get_context());
          CUDAPP_CALL_GUARDED_CLEANUP(cuEventDestroy, (m_event));
        }
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(event);
      }

      event *record(py::object stream_py)
      { 
        CUstream s_handle;
        if (stream_py.ptr() != Py_None)
        {
          const stream &s = py::extract<const stream &>(stream_py);
          s_handle = s.handle();
        }
        else
          s_handle = 0;
        CUDAPP_CALL_GUARDED(cuEventRecord, (m_event, s_handle)); 
        return this;
      }
      event *synchronize()
      { 
        CUDAPP_CALL_GUARDED_THREADED(cuEventSynchronize, (m_event)); 
        return this;
      }

      bool query() const
      { 
#ifdef TRACE_CUDA
        std::cerr << "cuEventQuery" << std::endl;
#endif
        CUresult result = cuEventQuery(m_event);
        switch (result)
        {
          case CUDA_SUCCESS: 
            return true;
          case CUDA_ERROR_NOT_READY: 
            return false;
          default:
            throw error("cuEventQuery", result);
        }
      }

      float time_since(event const &start)
      {
        float result;
        CUDAPP_CALL_GUARDED(cuEventElapsedTime, (&result, start.m_event, m_event));
        return result;
      }

      float time_till(event const &end)
      {
        float result;
        CUDAPP_CALL_GUARDED(cuEventElapsedTime, (&result, m_event, end.m_event));
        return result;
      }
  };
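
  // A minimal timing sketch (assumes an active context; a default-constructed
  // py::object passes None, i.e. the default stream):
  //
  //   event start, stop;
  //   start.record(py::object());
  //   // ... launch kernels ...
  //   stop.record(py::object());
  //   stop.synchronize();
  //   float ms = stop.time_since(start);  // elapsed time in milliseconds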
}




#endif