cuda.hpp

        m_array.reset();
        return byte_offset;
      }

#if CUDAPP_CUDA_VERSION >= 2020
      // Hand the device pointer `dptr`, the array descriptor `descr` and the
      // pitch value to cuTexRefSetAddress2D for this texture reference.
      // Note the driver call takes the descriptor before the pointer.
      void set_address_2d(
          CUdeviceptr dptr,
          const CUDA_ARRAY_DESCRIPTOR &descr,
          unsigned int pitch)
      {
        CUDAPP_CALL_GUARDED(
            cuTexRefSetAddress2D,
            (m_texref, &descr, dptr, pitch));
      }
#endif

      // Forward the element format and the packed-component count to
      // cuTexRefSetFormat for this texture reference.
      void set_format(CUarray_format fmt, int num_packed_components)
      {
        CUDAPP_CALL_GUARDED(
            cuTexRefSetFormat,
            (m_texref, fmt, num_packed_components));
      }

      // Set the addressing mode `am` for dimension `dim` through
      // cuTexRefSetAddressMode.
      void set_address_mode(int dim, CUaddress_mode am)
      {
        CUDAPP_CALL_GUARDED(
            cuTexRefSetAddressMode,
            (m_texref, dim, am));
      }
      // Set the filter mode `fm` through cuTexRefSetFilterMode.
      void set_filter_mode(CUfilter_mode fm)
      {
        CUDAPP_CALL_GUARDED(
            cuTexRefSetFilterMode,
            (m_texref, fm));
      }

      // Pass the flag word `flags` to cuTexRefSetFlags.
      void set_flags(unsigned int flags)
      {
        CUDAPP_CALL_GUARDED(
            cuTexRefSetFlags,
            (m_texref, flags));
      }

      // Return the device address reported by cuTexRefGetAddress for this
      // texture reference.
      CUdeviceptr get_address()
      {
        CUdeviceptr bound_address;
        CUDAPP_CALL_GUARDED(
            cuTexRefGetAddress,
            (&bound_address, m_texref));
        return bound_address;
      }
      // Query the CUarray handle via cuTexRefGetArray and return it wrapped
      // in a freshly new-ed `array` (second constructor argument `false`).
      // The returned object is heap-allocated on every call; the caller
      // receives the raw pointer.
      array *get_array()
      {
        CUarray bound_array;
        CUDAPP_CALL_GUARDED(
            cuTexRefGetArray,
            (&bound_array, m_texref));
        return new array(bound_array, false);
      }
      // Return the addressing mode cuTexRefGetAddressMode reports for
      // dimension `dim`.
      CUaddress_mode get_address_mode(int dim)
      {
        CUaddress_mode mode;
        CUDAPP_CALL_GUARDED(
            cuTexRefGetAddressMode,
            (&mode, m_texref, dim));
        return mode;
      }
      // Return the filter mode reported by cuTexRefGetFilterMode.
      CUfilter_mode get_filter_mode()
      {
        CUfilter_mode mode;
        CUDAPP_CALL_GUARDED(
            cuTexRefGetFilterMode,
            (&mode, m_texref));
        return mode;
      }

#if CUDAPP_CUDA_VERSION >= 2000
      // Return a Python tuple (format, channel count) as reported by
      // cuTexRefGetFormat.
      py::tuple get_format()
      {
        CUarray_format element_format;
        int channel_count;
        CUDAPP_CALL_GUARDED(
            cuTexRefGetFormat,
            (&element_format, &channel_count, m_texref));
        return py::make_tuple(element_format, channel_count);
      }
#endif

      // Return the flag word reported by cuTexRefGetFlags.
      unsigned int get_flags()
      {
        unsigned int flag_bits;
        CUDAPP_CALL_GUARDED(
            cuTexRefGetFlags,
            (&flag_bits, m_texref));
        return flag_bits;
      }
  };

  // }}}

  // {{{ surface reference
#if CUDAPP_CUDA_VERSION >= 3010
  class module;

  // Wrapper around a CUsurfref handle. Next to the raw handle it stores
  // shared_ptrs to the array given to set_array() and to the module given to
  // set_module(), so both objects live at least as long as this wrapper.
  class surface_reference : public boost::noncopyable
  {
    private:
      CUsurfref m_surfref;

      // keep-alive references for the bound array and the owning module
      boost::shared_ptr<array> m_array;
      boost::shared_ptr<module> m_module;

    public:
      surface_reference(CUsurfref sr)
        : m_surfref(sr)
      { }

      // Raw driver handle.
      CUsurfref handle() const
      {
        return m_surfref;
      }

      // Remember `mod` so that it is not destroyed before this object.
      void set_module(boost::shared_ptr<module> mod)
      {
        m_module = mod;
      }

      // Bind `ary` via cuSurfRefSetArray. The shared_ptr is stored only
      // after the guarded driver call has completed.
      void set_array(boost::shared_ptr<array> ary, unsigned int flags)
      {
        CUDAPP_CALL_GUARDED(
            cuSurfRefSetArray,
            (m_surfref, ary->handle(), flags));
        m_array = ary;
      }

      // Query the bound CUarray via cuSurfRefGetArray and return it wrapped
      // in a freshly new-ed `array` (second constructor argument `false`).
      array *get_array()
      {
        CUarray bound_array;
        CUDAPP_CALL_GUARDED(
            cuSurfRefGetArray,
            (&bound_array, m_surfref));
        return new array(bound_array, false);
      }
  };
#endif

  // }}}

  // {{{ module
  class function;

  // Owner of a CUmodule handle. The destructor activates the context
  // returned by get_context() and unloads the module with cuModuleUnload.
  class module : public boost::noncopyable, public context_dependent
  {
    private:
      CUmodule m_module;

    public:
      module(CUmodule mod)
        : m_module(mod)
      { }

      ~module()
      {
        try
        {
          // The activation object lives only for the duration of the unload.
          scoped_context_activation ca(get_context());
          CUDAPP_CALL_GUARDED_CLEANUP(
              cuModuleUnload,
              (m_module));
        }
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(module);
      }

      // Raw driver handle.
      CUmodule handle() const
      {
        return m_module;
      }

      // Look up a kernel by name; defined after class `function`.
      function get_function(const char *name);

      // Look up the global `name` with cuModuleGetGlobal and return a Python
      // tuple (device pointer, size in bytes).
      py::tuple get_global(const char *name)
      {
        CUdeviceptr global_ptr;
        pycuda_size_t global_size;
        CUDAPP_CALL_GUARDED(
            cuModuleGetGlobal,
            (&global_ptr, &global_size, m_module, name));
        return py::make_tuple(global_ptr, global_size);
      }
  };

  // Load the module stored in `filename` with cuModuleLoad and return a
  // heap-allocated `module` wrapping the resulting handle.
  inline
  module *module_from_file(const char *filename)
  {
    CUmodule loaded;
    CUDAPP_CALL_GUARDED(
        cuModuleLoad,
        (&loaded, filename));
    return new module(loaded);
  }

  // Look up the texture reference `name` in `mod` (cuModuleGetTexRef) and
  // return a heap-allocated wrapper that also holds on to `mod`.
  inline
  texture_reference *module_get_texref(
      boost::shared_ptr<module> mod, const char *name)
  {
    CUtexref raw_texref;
    CUDAPP_CALL_GUARDED(
        cuModuleGetTexRef,
        (&raw_texref, mod->handle(), name));

    // auto_ptr owns the wrapper until release(), so it is deleted if
    // set_module throws.
    std::auto_ptr<texture_reference> wrapper(
        new texture_reference(raw_texref, false));
    wrapper->set_module(mod);
    return wrapper.release();
  }

#if CUDAPP_CUDA_VERSION >= 3010
  // Look up the surface reference `name` in `mod` (cuModuleGetSurfRef) and
  // return a heap-allocated wrapper that also holds on to `mod`.
  inline
  surface_reference *module_get_surfref(
      boost::shared_ptr<module> mod, const char *name)
  {
    CUsurfref raw_surfref;
    CUDAPP_CALL_GUARDED(
        cuModuleGetSurfRef,
        (&raw_surfref, mod->handle(), name));

    // auto_ptr owns the wrapper until release(), so it is deleted if
    // set_module throws.
    std::auto_ptr<surface_reference> wrapper(
        new surface_reference(raw_surfref));
    wrapper->set_module(mod);
    return wrapper.release();
  }
#endif

  // }}}

  // {{{ function
  class function
  {
    private:
      CUfunction m_function;
      std::string m_symbol;

    public:
      function(CUfunction func, std::string const &sym)
        : m_function(func), m_symbol(sym)
      { }

      void set_block_shape(int x, int y, int z)
      {
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
            cuFuncSetBlockShape, (m_function, x, y, z), m_symbol);
      }
      void set_shared_size(unsigned int bytes)
      {
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
            cuFuncSetSharedSize, (m_function, bytes), m_symbol);
      }

      void param_set_size(unsigned int bytes)
      {
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
            cuParamSetSize, (m_function, bytes), m_symbol);
      }
      void param_set(int offset, unsigned int value)
      {
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
            cuParamSeti, (m_function, offset, value), m_symbol);
      }
      void param_set(int offset, float value)
      {
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuParamSetf, (m_function, offset, value), m_symbol);
      }
      void param_setv(int offset, void *buf, size_t len)
      {
        // maybe the unsigned int will change, it does not seem right
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuParamSetv, (m_function, offset, buf, (unsigned int) len), m_symbol);
      }
      void param_set_texref(const texture_reference &tr)
      {
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(cuParamSetTexRef, (m_function,
            CU_PARAM_TR_DEFAULT, tr.handle()), m_symbol);
      }

      void launch()
      {
        CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
            cuLaunch, (m_function), m_symbol);
      }
      void launch_grid(int grid_width, int grid_height)
      {
        CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
          cuLaunchGrid, (m_function, grid_width, grid_height), m_symbol);
      }
      void launch_grid_async(int grid_width, int grid_height, const stream &s)
      {
        CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
            cuLaunchGridAsync, (m_function, grid_width, grid_height, s.handle()),
            m_symbol);
      }

#if CUDAPP_CUDA_VERSION >= 2020
      int get_attribute(CUfunction_attribute attr) const
      {
        int result;
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
            cuFuncGetAttribute, (&result, attr, m_function), m_symbol);
        return result;
      }
#endif

#if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA)
      void set_cache_config(CUfunc_cache fc)
      {
        CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
            cuFuncSetCacheConfig, (m_function, fc), m_symbol);
      }
#endif

#if CUDAPP_CUDA_VERSION >= 4000
      void launch_kernel(py::tuple grid_dim_py, py::tuple block_dim_py,
          py::object parameter_buffer,
          unsigned shared_mem_bytes, py::object stream_py)
      {
        const unsigned axis_count = 3;
        unsigned grid_dim[axis_count];
        unsigned block_dim[axis_count];

        for (unsigned i = 0; i < axis_count; ++i)
        {
          grid_dim[i] = 1;
          block_dim[i] = 1;
        }

        unsigned gd_length = py::len(grid_dim_py);
        if (gd_length > axis_count)
          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
              "too many grid dimensions in kernel launch");

        for (unsigned i = 0; i < gd_length; ++i)
          grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);

        unsigned bd_length = py::len(block_dim_py);
        if (bd_length > axis_count)
          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
              "too many block dimensions in kernel launch");

        for (unsigned i = 0; i < bd_length; ++i)
          block_dim[i] = py::extract<unsigned>(block_dim_py[i]);

        CUstream s_handle;
        if (stream_py.ptr() != Py_None)
        {
          const stream &s = py::extract<const stream &>(stream_py);
          s_handle = s.handle();
        }
        else
          s_handle = 0;

        const void *par_buf;
        PYCUDA_BUFFER_SIZE_T py_par_len;
        if (PyObject_AsReadBuffer(parameter_buffer.ptr(), &par_buf, &py_par_len))
          throw py::error_already_set();
        size_t par_len = py_par_len;

        void *config[] = {
          CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf),
          CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
          CU_LAUNCH_PARAM_END
        };

        CUDAPP_CALL_GUARDED(
            cuLaunchKernel, (m_function, 
              grid_dim[0], grid_dim[1], grid_dim[2],
              block_dim[0], block_dim[1], block_dim[2],
              shared_mem_bytes, s_handle, 0, config
              ));
      }
#endif
  };

  // Look up the kernel `name` with cuModuleGetFunction and return it as a
  // `function` that also records `name` as its symbol.
  inline
  function module::get_function(const char *name)
  {
    CUfunction kernel;
    CUDAPP_CALL_GUARDED(
        cuModuleGetFunction,
        (&kernel, m_module, name));
    return function(kernel, name);
  }

  // }}}

  // {{{ device memory
  // Return the two values reported by cuMemGetInfo as a Python tuple
  // (free, total).
  inline
  py::tuple mem_get_info()
  {
    pycuda_size_t free_bytes;
    pycuda_size_t total_bytes;
    CUDAPP_CALL_GUARDED(
        cuMemGetInfo,
        (&free_bytes, &total_bytes));
    return py::make_tuple(free_bytes, total_bytes);
  }

  // Allocate `bytes` bytes with cuMemAlloc and return the raw device
  // pointer.
  inline
  CUdeviceptr mem_alloc(size_t bytes)
  {
    CUdeviceptr allocation;
    CUDAPP_CALL_GUARDED(
        cuMemAlloc,
        (&allocation, bytes));
    return allocation;
  }

  // Release `devptr` with cuMemFree, using the cleanup flavour of the
  // guarded-call macro.
  inline
  void mem_free(CUdeviceptr devptr)
  {
    CUDAPP_CALL_GUARDED_CLEANUP(
        cuMemFree,
        (devptr));
  }

  // A class the user can override to make device_allocation-
  // workalikes.

  // Abstract interface for objects that can produce a device pointer.
  // Subclasses implement get_pointer(); the conversion operator simply
  // returns whatever get_pointer() yields.
  class pointer_holder_base
  {
    public:
      virtual ~pointer_holder_base()
      { }

      virtual CUdeviceptr get_pointer() = 0;

      operator CUdeviceptr()
      {
        return get_pointer();
      }
  };

  // Owner of a device pointer. The pointer is released by free(), either
  // explicitly or from the destructor if it has not been freed before.
  class device_allocation : public boost::noncopyable, public context_dependent
  {
    private:
      // true until free() has run
      bool m_valid;

    protected:
      CUdeviceptr m_devptr;

    public:
      device_allocation(CUdeviceptr devptr)
        : m_valid(true), m_devptr(devptr)
      { }

      // Release the allocation with mem_free() while the context from
      // get_context() is activated, then release the context and mark this
      // object invalid. Throws pycuda::error (CUDA_ERROR_INVALID_HANDLE)
      // when the allocation was already freed.
      void free()
      {
        if (!m_valid)
          throw pycuda::error("device_allocation::free", CUDA_ERROR_INVALID_HANDLE);

        try
        {
          scoped_context_activation ca(get_context());
          mem_free(m_devptr);
        }
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(device_allocation);

        release_context();
        m_valid = false;
      }

      ~device_allocation()
      {
        // Skip allocations that were already freed explicitly.
        if (m_valid)
          free();
      }

      operator CUdeviceptr() const
      {
        return m_devptr;
      }
  };

  // Allocate pitched memory with cuMemAllocPitch. On success `da` is made to
  // own a new device_allocation for the returned pointer (whatever `da`
  // owned before is deleted) and the pitch reported by the driver is
  // returned.
  inline Py_ssize_t mem_alloc_pitch(
      std::auto_ptr<device_allocation> &da,
        unsigned int width, unsigned int height, unsigned int access_size)
  {
    CUdeviceptr pitched_ptr;
    pycuda_size_t row_pitch;
    CUDAPP_CALL_GUARDED(
        cuMemAllocPitch,
        (&pitched_ptr, &row_pitch, width, height, access_size));

    da.reset(new device_allocation(pitched_ptr));
    return row_pitch;
  }

  // Return the (base, size) pair cuMemGetAddressRange reports for `ptr` as a
  // Python tuple.
  inline
  py::tuple mem_get_address_range(CUdeviceptr ptr)
  {
    CUdeviceptr range_base;
    pycuda_size_t range_size;
    CUDAPP_CALL_GUARDED(
        cuMemGetAddressRange,
        (&range_base, &range_size, ptr));
    return py::make_tuple(range_base, range_size);
  }

  // Device-to-array copy: forwards (array handle, index, src, len) to
  // cuMemcpyDtoA through the threaded guarded-call macro.
  inline
  void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len)
  {
    CUDAPP_CALL_GUARDED_THREADED(
        cuMemcpyDtoA,
        (ary.handle(), index, src, len));
  }

  // Array-to-device copy: forwards (dst, array handle, index, len) to
  // cuMemcpyAtoD through the threaded guarded-call macro.
  inline
  void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len)
  {
    CUDAPP_CALL_GUARDED_THREADED(
        cuMemcpyAtoD,
        (dst, ary.handle(), index, len));
  }

  // Array-to-array copy: forwards both array handles, their indices and
  // `len` to cuMemcpyAtoA through the threaded guarded-call macro.
  inline
  void memcpy_atoa(
      array const &dst, unsigned int dst_index,
      array const &src, unsigned int src_index,
      unsigned int len)
  {
    CUDAPP_CALL_GUARDED_THREADED(
        cuMemcpyAtoA,
        (dst.handle(), dst_index, src.handle(), src_index, len));
  }

  // }}}

  // {{{ structured memcpy

// MEMCPY_SETTERS expands to six member functions and is pasted into the
// structured-memcpy descriptor structs below, which inherit the
// src*/dst* fields these setters write to.
//
//   set_src_host / set_dst_host:     set the memory type to
//       CU_MEMORYTYPE_HOST and store the pointer obtained from
//       PyObject_AsReadBuffer (source) or PyObject_AsWriteBuffer
//       (destination) in srcHost / dstHost. The buffer length is fetched
//       but not used. Throws py::error_already_set if the buffer call
//       reports failure. Only the raw pointer is stored; these setters
//       retain no reference to buf_py.
//   set_src_array / set_dst_array:   set the memory type to
//       CU_MEMORYTYPE_ARRAY and store ary.handle().
//   set_src_device / set_dst_device: set the memory type to
//       CU_MEMORYTYPE_DEVICE and store devptr.
//
// Do not add `//` comments inside the macro body: the line continuations
// would pull the rest of the macro into the comment.
#define MEMCPY_SETTERS \
    void set_src_host(py::object buf_py) \
    { \
      srcMemoryType = CU_MEMORYTYPE_HOST; \
      PYCUDA_BUFFER_SIZE_T len; \
      if (PyObject_AsReadBuffer(buf_py.ptr(), &srcHost, &len)) \
        throw py::error_already_set(); \
    } \
    \
    void set_src_array(array const &ary)  \
    {  \
      srcMemoryType = CU_MEMORYTYPE_ARRAY; \
      srcArray = ary.handle();  \
    } \
    \
    void set_src_device(CUdeviceptr devptr)  \
    { \
      srcMemoryType = CU_MEMORYTYPE_DEVICE; \
      srcDevice = devptr; \
    } \
    \
    void set_dst_host(py::object buf_py) \
    { \
      dstMemoryType = CU_MEMORYTYPE_HOST; \
      PYCUDA_BUFFER_SIZE_T len; \
      if (PyObject_AsWriteBuffer(buf_py.ptr(), &dstHost, &len)) \
        throw py::error_already_set(); \
    } \
    \
    void set_dst_array(array const &ary) \
    { \
      dstMemoryType = CU_MEMORYTYPE_ARRAY; \
      dstArray = ary.handle(); \
    } \
    \
    void set_dst_device(CUdeviceptr devptr)  \
    { \
      dstMemoryType = CU_MEMORYTYPE_DEVICE; \
      dstDevice = devptr; \
    }


  // CUDA_MEMCPY2D descriptor with Python-friendly setters and execute
  // helpers. The constructor zeroes the four source/destination offsets;
  // all other fields are left for the caller to fill in.
  struct memcpy_2d : public CUDA_MEMCPY2D
  {
    memcpy_2d()
    {
      srcXInBytes = 0;
      dstXInBytes = 0;

      srcY = 0;
      dstY = 0;
    }

    MEMCPY_SETTERS;

    // Run the copy: cuMemcpy2D when `aligned` is true, otherwise
    // cuMemcpy2DUnaligned (the default).
    void execute(bool aligned=false) const
    {
      if (!aligned)
      {
        CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DUnaligned, (this));
      }
      else
      {
        CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2D, (this));
      }
    }

    // Issue the copy on stream `s` via cuMemcpy2DAsync.
    void execute_async(const stream &s) const
    {
      CUDAPP_CALL_GUARDED_THREADED(
          cuMemcpy2DAsync,
          (this, s.handle()));
    }
  };

#if CUDAPP_CUDA_VERSION >= 2000
  // CUDA_MEMCPY3D descriptor with Python-friendly setters and execute
  // helpers. The constructor zeroes the reserved fields and all
  // source/destination offsets and LOD values.
  struct memcpy_3d : public CUDA_MEMCPY3D
  {
    memcpy_3d()
    {
      reserved0 = 0;
      reserved1 = 0;

      // source origin
      srcXInBytes = 0;
      srcY = 0;
      srcZ = 0;
      srcLOD = 0;

      // destination origin
      dstXInBytes = 0;
      dstY = 0;
      dstZ = 0;
      dstLOD = 0;
    }

    MEMCPY_SETTERS;

    // Run the copy via cuMemcpy3D.
    void execute() const
    {
      CUDAPP_CALL_GUARDED_THREADED(
          cuMemcpy3D,
          (this));
    }

    // Issue the copy on stream `s` via cuMemcpy3DAsync.
    void execute_async(const stream &s) const
    {
      CUDAPP_CALL_GUARDED_THREADED(
          cuMemcpy3DAsync,
          (this, s.handle()));
    }
  };
#endif

#if CUDAPP_CUDA_VERSION >= 4000
  // CUDA_MEMCPY3D_PEER descriptor with Python-friendly setters and execute
  // helpers. The constructor zeroes all source/destination offsets and LOD
  // values; the contexts are set through set_src_context/set_dst_context.
  struct memcpy_3d_peer : public CUDA_MEMCPY3D_PEER
  {
    memcpy_3d_peer()
    {
      // source origin
      srcXInBytes = 0;
      srcY = 0;
      srcZ = 0;
      srcLOD = 0;

      // destination origin
      dstXInBytes = 0;
      dstY = 0;
      dstZ = 0;
      dstLOD = 0;
    }

    MEMCPY_SETTERS;

    // Store the handle of `ctx` as the source context.
    void set_src_context(context const &ctx)
    { srcContext = ctx.handle(); }

    // Store the handle of `ctx` as the destination context.
    void set_dst_context(context const &ctx)
    { dstContext = ctx.handle(); }

    // Run the copy via cuMemcpy3DPeer.
    void execute() const
    {
      CUDAPP_CALL_GUARDED_THREADED(
          cuMemcpy3DPeer,
          (this));
    }

    // Issue the copy on stream `s` via cuMemcpy3DPeerAsync.
    void execute_async(const stream &s) const
    {
      CUDAPP_CALL_GUARDED_THREADED(
          cuMemcpy3DPeerAsync,
          (this, s.handle()));
    }
  };
#endif

  // }}}

  // {{{ host memory
  // Allocate `size` bytes of host memory through the driver and return the
  // pointer. With CUDAPP_CUDA_VERSION >= 2020 the call is cuMemHostAlloc and
  // `flags` is forwarded; otherwise cuMemAllocHost is used and a nonzero
  // `flags` is rejected with pycuda::error (CUDA_ERROR_INVALID_VALUE).
  inline void *mem_host_alloc(size_t size, unsigned flags=0)
  {
    void *host_block;

#if CUDAPP_CUDA_VERSION >= 2020
    CUDAPP_CALL_GUARDED(
        cuMemHostAlloc,
        (&host_block, size, flags));
#else
    if (flags != 0)
      throw pycuda::error("mem_host_alloc", CUDA_ERROR_INVALID_VALUE,
          "nonzero flags in mem_host_alloc not allowed in CUDA 2.1 and older");
    CUDAPP_CALL_GUARDED(
        cuMemAllocHost,
        (&host_block, size));
#endif

    return host_block;
  }

  // Release `ptr` with cuMemFreeHost, using the cleanup flavour of the
  // guarded-call macro.
  inline void mem_host_free(void *ptr)
  {
    CUDAPP_CALL_GUARDED_CLEANUP(
        cuMemFreeHost,
        (ptr));
  }

#if CUDAPP_CUDA_VERSION >= 4000
  // Register the host range [ptr, ptr + bytes) with cuMemHostRegister and
  // return `ptr` unchanged, which lets the call be used inside a
  // constructor initializer list.
  inline void *mem_host_register(void *ptr, size_t bytes, unsigned int flags=0)
  {
    CUDAPP_CALL_GUARDED(
        cuMemHostRegister,
        (ptr, bytes, flags));
    return ptr;
  }

  // Undo a registration with cuMemHostUnregister, using the cleanup
  // flavour of the guarded-call macro.
  inline void mem_host_unregister(void *ptr)
  {
    CUDAPP_CALL_GUARDED_CLEANUP(
        cuMemHostUnregister,
        (ptr));
  }
#endif

  /* Allocate `size` bytes of host memory whose start address is a multiple
   * of `alignment`.
   *
   * The block comes from malloc() with (alignment - 1) bytes of slack; the
   * returned address is the first suitably aligned address inside it. That
   * address is in general NOT the one malloc() returned and therefore must
   * not be passed to free(). If `original_pointer` is non-null, the malloc()
   * result is stored through it; release the block by passing that value to
   * free().
   *
   * Throws pycuda::error with CUDA_ERROR_INVALID_VALUE if `alignment` is
   * zero or not a power of two, and with CUDA_ERROR_OUT_OF_MEMORY if the
   * padded size is not representable or malloc() fails.
   */
  inline void *aligned_malloc(size_t size, size_t alignment,
      void **original_pointer=0)
  {
    // alignment must be a power of two.
    if ((alignment & (alignment - 1)) != 0)
      throw pycuda::error("aligned_malloc", CUDA_ERROR_INVALID_VALUE,
          "alignment must be a power of two");

    // zero passes the bit test above, so it needs its own check
    if (alignment == 0)
      throw pycuda::error("aligned_malloc", CUDA_ERROR_INVALID_VALUE,
          "alignment must non-zero");

    // guard against size + (alignment - 1) wrapping around
    if (size > size_t(-1) - (alignment - 1))
      throw pycuda::error("aligned_malloc", CUDA_ERROR_OUT_OF_MEMORY,
          "aligned malloc failed");

    void *p = malloc(size + (alignment - 1));
    if (!p)
      throw pycuda::error("aligned_malloc", CUDA_ERROR_OUT_OF_MEMORY,
          "aligned malloc failed");

    if (original_pointer)
      *original_pointer = p;

    // round up to the next multiple of alignment
    return (void *)((((ptrdiff_t)(p)) + (alignment-1)) & -alignment);
  }


  /* Base class for host-memory owners. Holds the data pointer and a flag
   * telling whether the pointer still has to be released.
   *
   * Releasing is the job of the derived class: each one overrides free()
   * AND defines its own destructor that calls it. This cannot be done in
   * ~host_pointer, because while a base-class destructor runs, virtual calls
   * resolve to the base-class version; calling free() from here would only
   * ever reach the empty host_pointer::free().
   */
  struct host_pointer : public boost::noncopyable, public context_dependent
  {
    protected:
      bool m_valid;
      void *m_data;

    public:
      // No memory attached; data() returns a null pointer.
      host_pointer()
        : m_valid(false), m_data(0)
      { }

      host_pointer(void *ptr)
        : m_valid(true), m_data(ptr)
      { }

      // Intentionally empty, see the class comment.
      virtual ~host_pointer()
      { }

      virtual void free()
      { }

      void *data()
      { return m_data; }

#if CUDAPP_CUDA_VERSION >= 2020
      // Return the device pointer cuMemHostGetDevicePointer reports for the
      // held host pointer (flags argument 0).
      CUdeviceptr get_device_pointer()
      {
        CUdeviceptr result;
        CUDAPP_CALL_GUARDED(cuMemHostGetDevicePointer, (&result, m_data, 0));
        return result;
      }
#endif

  };

  /* Host memory obtained from mem_host_alloc() and released with
   * mem_host_free(), either explicitly through free() or by the destructor.
   */
  struct pagelocked_host_allocation : public host_pointer
  {
    public:
      pagelocked_host_allocation(size_t bytesize, unsigned flags=0)
        : host_pointer(mem_host_alloc(bytesize, flags))
      { }

      // Must live in this class: a call made from ~host_pointer would not
      // reach the free() override below.
      ~pagelocked_host_allocation()
      {
        if (m_valid)
          free();
      }

      // Release the memory while the context from get_context() is
      // activated, then release the context. Throws pycuda::error
      // (CUDA_ERROR_INVALID_HANDLE) if the memory was already freed.
      void free()
      {
        if (m_valid)
        {
          try
          {
            scoped_context_activation ca(get_context());
            mem_host_free(m_data);
          }
          CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(pagelocked_host_allocation);

          release_context();
          m_valid = false;
        }
        else
          throw pycuda::error("pagelocked_host_allocation::free", CUDA_ERROR_INVALID_HANDLE);
      }

#if CUDAPP_CUDA_VERSION >= 3020
      // Return the flag word cuMemHostGetFlags reports for the allocation.
      unsigned int get_flags()
      {
        unsigned int flags;
        CUDAPP_CALL_GUARDED(cuMemHostGetFlags, (&flags, m_data));
        return flags;
      }
#endif
  };

  /* Host memory obtained from aligned_malloc(). data() yields the aligned
   * address; the address malloc() actually returned is kept separately
   * because that is the only value that may be passed to ::free().
   */
  struct aligned_host_allocation : public host_pointer
  {
    private:
      // Written by aligned_malloc() while the base class is being
      // initialized, hence deliberately absent from the initializer list.
      void *m_original_pointer;

    public:
      aligned_host_allocation(size_t size, size_t alignment)
        : host_pointer(aligned_malloc(size, alignment, &m_original_pointer))
      { }

      // Must live in this class: a call made from ~host_pointer would not
      // reach the free() override below.
      ~aligned_host_allocation()
      {
        if (m_valid)
          free();
      }

      // Release the block. Throws pycuda::error (CUDA_ERROR_INVALID_HANDLE)
      // if it was already freed.
      void free()
      {
        if (m_valid)
        {
          ::free(m_original_pointer);
          m_valid = false;
        }
        else
          throw pycuda::error("aligned_host_allocation::free", CUDA_ERROR_INVALID_HANDLE);
      }
  };

#if CUDAPP_CUDA_VERSION >= 4000
  /* Host memory registered with mem_host_register() and unregistered again
   * by free() or, if free() was not called, by the destructor. `base` is an
   * arbitrary Python object stored alongside the registration and handed
   * back by base(); holding it keeps that object alive for the lifetime of
   * this one.
   */
  struct registered_host_memory : public host_pointer
  {
    private:
      py::object m_base;

    public:
      registered_host_memory(void *p, size_t bytes, unsigned int flags=0, 
          py::object base=py::object())
        : host_pointer(mem_host_register(p, bytes, flags)), m_base(base)
      {
      }

      // The unregistration has to be triggered from this class. While the
      // base-class destructor runs, virtual calls resolve to the base-class
      // version, so a free() call made there never reaches the override
      // below and the memory would stay registered.
      ~registered_host_memory()
      {
        if (m_valid)
          free();
      }

      // Unregister the memory while the context from get_context() is
      // activated, then release the context. Throws pycuda::error
      // (CUDA_ERROR_INVALID_HANDLE) if already unregistered.
      void free()
      {
        if (m_valid)
        {
          try
          {
            scoped_context_activation ca(get_context());
            mem_host_unregister(m_data);
          }
          // NOTE(review): the macro argument does not match the class name;
          // left unchanged because the macro definition is not visible here.
          CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(host_allocation);

          release_context();
          m_valid = false;
        }
        else
          throw pycuda::error("registered_host_memory::free", CUDA_ERROR_INVALID_HANDLE);
      }

      py::object base() const
      {
        return m_base;
      }
  };
#endif

  // }}}

  // {{{ event
  // Owner of a CUevent. Created with cuEventCreate in the constructor and
  // destroyed with cuEventDestroy in the destructor, with the context from
  // get_context() activated for the destroy call.
  class event : public boost::noncopyable, public context_dependent
  {
    private:
      CUevent m_event;

    public:
      event(unsigned int flags=0)
      {
        CUDAPP_CALL_GUARDED(
            cuEventCreate,
            (&m_event, flags));
      }

      ~event()
      {
        try
        {
          scoped_context_activation ca(get_context());
          CUDAPP_CALL_GUARDED_CLEANUP(
              cuEventDestroy,
              (m_event));
        }
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(event);
      }

      // Raw driver handle.
      CUevent handle() const
      {
        return m_event;
      }

      // Record the event with cuEventRecord. `stream_py` is either None
      // (stream handle 0 is used) or a `stream` object. Returns `this` so
      // calls can be chained.
      event *record(py::object stream_py)
      {
        CUstream target_stream = 0;
        if (stream_py.ptr() != Py_None)
        {
          const stream &s = py::extract<const stream &>(stream_py);
          target_stream = s.handle();
        }

        CUDAPP_CALL_GUARDED(
            cuEventRecord,
            (m_event, target_stream));
        return this;
      }

      // Call cuEventSynchronize on the event. Returns `this` so calls can
      // be chained.
      event *synchronize()
      {
        CUDAPP_CALL_GUARDED_THREADED(
            cuEventSynchronize,
            (m_event));
        return this;
      }

      // Map the cuEventQuery status to a bool: true for CUDA_SUCCESS,
      // false for CUDA_ERROR_NOT_READY. Any other status is thrown as
      // `error`.
      bool query() const
      {
        CUDAPP_PRINT_CALL_TRACE("cuEventQuery");

        CUresult status = cuEventQuery(m_event);

        if (status == CUDA_SUCCESS)
          return true;
        if (status == CUDA_ERROR_NOT_READY)
          return false;

        CUDAPP_PRINT_ERROR_TRACE("cuEventQuery", status);
        throw error("cuEventQuery", status);
      }

      // cuEventElapsedTime with `start` as first event and this event as
      // second.
      float time_since(event const &start)
      {
        float elapsed;
        CUDAPP_CALL_GUARDED(
            cuEventElapsedTime,
            (&elapsed, start.m_event, m_event));
        return elapsed;
      }

      // cuEventElapsedTime with this event as first event and `end` as
      // second.
      float time_till(event const &end)
      {
        float elapsed;
        CUDAPP_CALL_GUARDED(
            cuEventElapsedTime,
            (&elapsed, m_event, end.m_event));
        return elapsed;
      }
  };

#if CUDAPP_CUDA_VERSION >= 3020
  // Forward to cuStreamWaitEvent with this stream, the handle of `evt` and
  // a flags argument of 0. Defined here rather than in the class because it
  // needs the full definition of `event`.
  inline void stream::wait_for_event(const event &evt)
  {
    CUDAPP_CALL_GUARDED(
        cuStreamWaitEvent,
        (m_stream, evt.handle(), 0));
  }
#endif

  // }}}

  // {{{ profiler
#if CUDAPP_CUDA_VERSION >= 4000
  // Forward the configuration file name, output file name and output mode
  // to cuProfilerInitialize.
  inline void initialize_profiler(
      const char *config_file,
      const char *output_file,
      CUoutput_mode output_mode)
  {
    CUDAPP_CALL_GUARDED(
        cuProfilerInitialize,
        (config_file, output_file, output_mode));
  }

  // Call cuProfilerStart through the guarded-call macro.
  inline void start_profiler()
  {
    CUDAPP_CALL_GUARDED(
        cuProfilerStart,
        ());
  }

  inline void stop_profiler()
  {
    CUDAPP_CALL_GUARDED(cuProfilerStart, ());
  }
#endif
  // }}}
}


#endif
// vim: foldmethod=marker