Skip to content
Snippets Groups Projects
wrap_cl.hpp 151 KiB
Newer Older
        : m_program(prog), m_program_kind(progkind)
      {
        if (retain)
          PYOPENCL_CALL_GUARDED(clRetainProgram, (prog));
      }

      // Releases the cl_program handle stored in m_program via
      // clReleaseProgram, using the cleanup variant of the call-guard macro.
      ~program()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseProgram, (m_program));
      }

      // Returns the raw cl_program handle held by this wrapper. The handle is
      // returned as-is; no retain/release call is made here.
      cl_program data() const
      {
        return m_program;
      }

      // Returns the program kind recorded at construction time
      // (e.g. program::KND_SOURCE, KND_BINARY, KND_IL as used by the factory
      // functions in this file).
      program_kind_type kind() const
      {
        return m_program_kind;
      }

      // NOTE(review): presumably generates the comparison helpers for
      // `program` -- confirm against the macro definition.
      PYOPENCL_EQUALITY_TESTS(program);

      // Returns the program property selected by `param_name` as a Python
      // object.
      //
      // Throws error("Program.get_info", CL_INVALID_VALUE) for any
      // `param_name` not handled below.
      //
      // NOTE(review): the cases carry no `break`; the PYOPENCL_GET_*_INFO and
      // PYOPENCL_RETURN_VECTOR macros are assumed to return from the
      // function -- confirm against their definitions.
      py::object get_info(cl_program_info param_name) const
      {
        switch (param_name)
        {
          case CL_PROGRAM_REFERENCE_COUNT:
            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name,
                cl_uint);
          case CL_PROGRAM_CONTEXT:
            PYOPENCL_GET_OPAQUE_INFO(Program, m_program, param_name,
                cl_context, context);
          case CL_PROGRAM_NUM_DEVICES:
            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_uint);
          case CL_PROGRAM_DEVICES:
            {
              std::vector<cl_device_id> result;
              PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result);

              // Wrap every device id in a freshly allocated device object.
              py::list py_result;
              for (cl_device_id did: result)
                py_result.append(handle_from_new_ptr(
                      new pyopencl::device(did)));
              return py_result;
            }
          case CL_PROGRAM_SOURCE:
            PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
          case CL_PROGRAM_BINARY_SIZES:
            {
              std::vector<size_t> result;
              PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result);
              PYOPENCL_RETURN_VECTOR(size_t, result);
            }
          case CL_PROGRAM_BINARIES:
            // {{{
            {
              std::vector<size_t> sizes;
              PYOPENCL_GET_VEC_INFO(Program, m_program, CL_PROGRAM_BINARY_SIZES, sizes);

              // Seed with size_t so that the sum is computed in size_t; an
              // int seed would make std::accumulate add up in int.
              size_t total_size = std::accumulate(
                  sizes.begin(), sizes.end(), size_t(0));

              // One contiguous allocation holds all binaries back to back;
              // result_ptrs[i] points at the start of the i-th binary.
              std::unique_ptr<unsigned char []> result(
                  new unsigned char[total_size]);
              std::vector<unsigned char *> result_ptrs;
              result_ptrs.reserve(sizes.size());

              unsigned char *ptr = result.get();
              for (size_t sz: sizes)
              {
                result_ptrs.push_back(ptr);
                ptr += sz;
              }

              PYOPENCL_CALL_GUARDED(clGetProgramInfo,
                  (m_program, param_name, sizes.size()*sizeof(unsigned char *),
                   result_ptrs.empty( ) ? nullptr : &result_ptrs.front(), 0));

              // Copy each binary out of the shared allocation into its own
              // Python bytes/str object.
              py::list py_result;
              ptr = result.get();
              for (size_t sz: sizes)
              {
                py::object binary_pyobj(
                    py::reinterpret_steal<py::object>(
#if PY_VERSION_HEX >= 0x03000000
                    PyBytes_FromStringAndSize(
                      reinterpret_cast<char *>(ptr), sz)
#else
                    PyString_FromStringAndSize(
                      reinterpret_cast<char *>(ptr), sz)
#endif
                    ));
                py_result.append(binary_pyobj);
                ptr += sz;
              }
              return py_result;
            }
            // }}}
#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_PROGRAM_NUM_KERNELS:
            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name,
                size_t);
          case CL_PROGRAM_KERNEL_NAMES:
            PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
#endif
#if PYOPENCL_CL_VERSION >= 0x2010
          case CL_PROGRAM_IL:
            PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
#endif
#if PYOPENCL_CL_VERSION >= 0x2020
          case CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT:
          case CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT:
            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_bool);
#endif

          default:
            throw error("Program.get_info", CL_INVALID_VALUE);
        }
      }

      // Returns the build property `param_name` of this program for device
      // `dev` as a Python object.
      //
      // Throws error("Program.get_build_info", CL_INVALID_VALUE) for any
      // `param_name` not handled below.
      //
      // NOTE(review): the cases carry no `break`; the PYOPENCL_GET_*_INFO
      // macros are assumed to return -- confirm against their definitions.
      py::object get_build_info(
          device const &dev,
          cl_program_build_info param_name) const
      {
        switch (param_name)
        {
          // Lets the single "first argument" slot of the info macros carry
          // both the program and the device.
#define PYOPENCL_FIRST_ARG m_program, dev.data() // hackety hack
          case CL_PROGRAM_BUILD_STATUS:
            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                PYOPENCL_FIRST_ARG, param_name,
                cl_build_status);
          case CL_PROGRAM_BUILD_OPTIONS:
          case CL_PROGRAM_BUILD_LOG:
            PYOPENCL_GET_STR_INFO(ProgramBuild,
                PYOPENCL_FIRST_ARG, param_name);
#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_PROGRAM_BINARY_TYPE:
            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                PYOPENCL_FIRST_ARG, param_name,
                cl_program_binary_type);
#endif
#if PYOPENCL_CL_VERSION >= 0x2000
          case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE:
            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                PYOPENCL_FIRST_ARG, param_name,
                size_t);
#endif
#undef PYOPENCL_FIRST_ARG

          default:
            throw error("Program.get_build_info", CL_INVALID_VALUE);
        }
      }

      // Builds this program through clBuildProgram with the given option
      // string. No notification callback or user data is passed.
      void build(std::string options, py::object py_devices)
      {
        // Provides the `num_devices` and `devices` names used below.
        PYOPENCL_PARSE_PY_DEVICES;

        const char *build_options = options.c_str();
        PYOPENCL_CALL_GUARDED_THREADED(clBuildProgram,
            (m_program, num_devices, devices,
             build_options, nullptr, nullptr));
      }

#if PYOPENCL_CL_VERSION >= 0x1020
      // Compiles this program through clCompileProgram.
      //
      // `py_headers` is an iterable of (name, program) tuples: each `name`
      // is passed as a header include name and each `program` as the
      // matching input header. Throws error("Program.compile",
      // CL_INVALID_VALUE, ...) if an entry does not have exactly two items.
      // No notification callback or user data is passed.
      void compile(std::string options, py::object py_devices,
          py::object py_headers)
      {
        // Provides the `num_devices` and `devices` names used below.
        PYOPENCL_PARSE_PY_DEVICES;

        // {{{ pick apart py_headers
        // py_headers is a list of tuples *(name, program)*

        std::vector<std::string> header_names;
        std::vector<cl_program> programs;
        for (py::handle name_hdr_tup_py: py_headers)
        {
          py::tuple name_hdr_tup = py::reinterpret_borrow<py::tuple>(name_hdr_tup_py);
          if (py::len(name_hdr_tup) != 2)
            throw error("Program.compile", CL_INVALID_VALUE,
                "expected (name, header) tuple in headers list");
          std::string name = (name_hdr_tup[0]).cast<std::string>();
          program &prg = (name_hdr_tup[1]).cast<program &>();

          header_names.push_back(name);
          programs.push_back(prg.data());
        }

        // Collected only after header_names is complete, so the c_str()
        // pointers are not invalidated by vector growth.
        std::vector<const char *> header_name_ptrs;
        header_name_ptrs.reserve(header_names.size());
        for (std::string const &name: header_names)
          header_name_ptrs.push_back(name.c_str());

        // }}}

        PYOPENCL_CALL_GUARDED_THREADED(clCompileProgram,
            (m_program, num_devices, devices,
             options.c_str(), header_names.size(),
             programs.empty() ? nullptr : &programs.front(),
             header_name_ptrs.empty() ? nullptr : &header_name_ptrs.front(),
             nullptr, nullptr));
      }
#endif

#if PYOPENCL_CL_VERSION >= 0x2020
      // Sets specialization constant `spec_id` from the bytes exposed by
      // `py_buffer` (requested as a contiguous buffer), forwarding the
      // buffer's length and address to clSetProgramSpecializationConstant.
      void set_specialization_constant(cl_uint spec_id, py::object py_buffer)
      {
        py_buffer_wrapper spec_value;
        spec_value.get(py_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);

        auto const &view = spec_value.m_buf;
        PYOPENCL_CALL_GUARDED(clSetProgramSpecializationConstant,
            (m_program, spec_id, view.len, view.buf));
      }
#endif
  };




  // Creates a program in `ctx` from the single source string `src` and
  // returns a newly allocated wrapper of kind KND_SOURCE.
  //
  // Throws pyopencl::error if clCreateProgramWithSource does not report
  // CL_SUCCESS. If wrapping fails, the CL program is released before the
  // exception propagates.
  inline
  program *create_program_with_source(
      context &ctx,
      std::string const &src)
  {
    // The CL entry point takes parallel arrays of strings and lengths; a
    // single entry is passed here.
    const char *source_text = src.c_str();
    size_t source_length = src.size();
    cl_int status_code;

    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithSource");
    cl_program result = clCreateProgramWithSource(
        ctx.data(), 1, &source_text, &source_length, &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateProgramWithSource", status_code);

    program *wrapped;
    try
    {
      // retain=false: the wrapper takes over the reference just created.
      wrapped = new program(result, false, program::KND_SOURCE);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
    return wrapped;
  }





  // Creates a program in `ctx` from one binary per device and returns a
  // newly allocated wrapper of kind KND_BINARY.
  //
  // `py_devices` and `py_binaries` must have the same length; element i of
  // `py_binaries` must expose a contiguous buffer holding the binary for
  // device i.
  //
  // Throws error(..., CL_INVALID_VALUE, ...) on a length mismatch, and
  // pyopencl::error if clCreateProgramWithBinary does not report
  // CL_SUCCESS. If wrapping fails, the CL program is released before the
  // exception propagates.
  inline
  program *create_program_with_binary(
      context &ctx,
      py::sequence py_devices,
      py::sequence py_binaries)
  {
    size_t num_devices = len(py_devices);
    if (len(py_binaries) != num_devices)
      throw error("create_program_with_binary", CL_INVALID_VALUE,
          "device and binary counts don't match");

    std::vector<cl_device_id> devices;
    std::vector<const unsigned char *> binaries;
    std::vector<size_t> sizes;

    // The buffer views must stay acquired until clCreateProgramWithBinary
    // has consumed the pointers stored in `binaries`, so the wrappers are
    // kept here instead of being destroyed at the end of each iteration.
    // Reserving up front means emplace_back below cannot reallocate, which
    // would otherwise risk leaking the freshly allocated wrapper.
    std::vector<std::unique_ptr<py_buffer_wrapper> > buf_wrappers;
    buf_wrappers.reserve(num_devices);

    for (size_t i = 0; i < num_devices; ++i)
    {
      devices.push_back(
          (py_devices[i]).cast<device const &>().data());

      buf_wrappers.emplace_back(new py_buffer_wrapper);
      py_buffer_wrapper &buf_wrapper = *buf_wrappers.back();

      buf_wrapper.get(py::object(py_binaries[i]).ptr(), PyBUF_ANY_CONTIGUOUS);

      binaries.push_back(
          reinterpret_cast<const unsigned char *>(buf_wrapper.m_buf.buf));
      sizes.push_back(buf_wrapper.m_buf.len);
    }

    // Receives the per-device load status written by the CL call.
    std::vector<cl_int> binary_statuses(num_devices);

    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBinary");
    cl_program result = clCreateProgramWithBinary(
        ctx.data(), num_devices,
        devices.empty( ) ? nullptr : &devices.front(),
        sizes.empty( ) ? nullptr : &sizes.front(),
        binaries.empty( ) ? nullptr : &binaries.front(),
        binary_statuses.empty( ) ? nullptr : &binary_statuses.front(),
        &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateProgramWithBinary", status_code);

    /*
    for (size_t i = 0; i < num_devices; ++i)
      printf("%d:%d\n", int(i), binary_statuses[i]);
      */

    try
    {
      // retain=false: the wrapper takes over the reference just created.
      return new program(result, false, program::KND_BINARY);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
  }



#if (PYOPENCL_CL_VERSION >= 0x1020) || \
      ((PYOPENCL_CL_VERSION >= 0x1030) && defined(__APPLE__))
  // Creates a program in `ctx` from the built-in kernels named in
  // `kernel_names` for the given devices and returns a newly allocated
  // wrapper.
  //
  // Throws pyopencl::error if clCreateProgramWithBuiltInKernels does not
  // report CL_SUCCESS. If wrapping fails, the CL program is released before
  // the exception propagates.
  inline
  program *create_program_with_built_in_kernels(
      context &ctx,
      py::object py_devices,
      std::string const &kernel_names)
  {
    // Provides the `num_devices` and `devices` names used below.
    PYOPENCL_PARSE_PY_DEVICES;

    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBuiltInKernels");
    const char *names = kernel_names.c_str();
    cl_program result = clCreateProgramWithBuiltInKernels(
        ctx.data(), num_devices, devices, names, &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateProgramWithBuiltInKernels", status_code);

    program *wrapped;
    try
    {
      // retain=false: the wrapper takes over the reference just created.
      wrapped = new program(result, false);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
    return wrapped;
  }
#endif



#if (PYOPENCL_CL_VERSION >= 0x2010)
  // Creates a program in `ctx` from the intermediate-language bytes held in
  // `src` and returns a newly allocated wrapper of kind KND_IL.
  //
  // Throws pyopencl::error if clCreateProgramWithIL does not report
  // CL_SUCCESS. If wrapping fails, the CL program is released before the
  // exception propagates.
  inline
  program *create_program_with_il(
      context &ctx,
      std::string const &src)
  {
    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithIL");

    const char *il_bytes = src.c_str();
    const size_t il_length = src.size();
    cl_program result = clCreateProgramWithIL(
        ctx.data(), il_bytes, il_length, &status_code);
    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clCreateProgramWithIL", status_code);

    program *wrapped;
    try
    {
      // retain=false: the wrapper takes over the reference just created.
      wrapped = new program(result, false, program::KND_IL);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
    return wrapped;
  }
#endif





#if PYOPENCL_CL_VERSION >= 0x1020
  // Links the programs in `py_programs` through clLinkProgram with the given
  // option string and returns a newly allocated wrapper for the result.
  // No notification callback or user data is passed.
  //
  // Throws pyopencl::error (constructed with the resulting program handle
  // and the status code) if clLinkProgram does not report CL_SUCCESS. If
  // wrapping fails, the CL program is released before the exception
  // propagates.
  inline
  program *link_program(
      context &ctx,
      py::object py_programs,
      std::string const &options,
      py::object py_devices
      )
  {
    // Provides the `num_devices` and `devices` names used below.
    PYOPENCL_PARSE_PY_DEVICES;

    std::vector<cl_program> programs;
    for (py::handle py_prg: py_programs)
    {
      program &prg = (py_prg).cast<program &>();
      programs.push_back(prg.data());
    }

    cl_int status_code;
    PYOPENCL_PRINT_CALL_TRACE("clLinkProgram");
    cl_program result = clLinkProgram(
        ctx.data(), num_devices, devices,
        options.c_str(),
        programs.size(),
        programs.empty() ? nullptr : &programs.front(),
        nullptr, nullptr,
        &status_code);

    if (status_code != CL_SUCCESS)
      throw pyopencl::error("clLinkProgram", result, status_code);

    try
    {
      // retain=false: the wrapper takes over the reference just created.
      return new program(result, false);
    }
    catch (...)
    {
      clReleaseProgram(result);
      throw;
    }
  }

#endif


#if PYOPENCL_CL_VERSION >= 0x1020
  // Calls clUnloadPlatformCompiler for the platform wrapped by `plat`,
  // through the call-guard macro.
  inline
  void unload_platform_compiler(platform &plat)
  {
    PYOPENCL_CALL_GUARDED(clUnloadPlatformCompiler, (plat.data()));
  }
#endif

  // }}}

  // {{{ kernel
  // Value type carrying a byte count. kernel::set_arg_local() passes size()
  // to clSetKernelArg together with a null data pointer.
  class local_memory
  {
    public:
      // Intentionally not explicit: implicit construction from size_t is
      // part of the existing interface.
      local_memory(size_t size)
        : m_size(size)
      { }

      // Returns the byte count given at construction.
      size_t size() const
      {
        return m_size;
      }

    private:
      size_t m_size;
  };




  class kernel : noncopyable
  {
    private:
      cl_kernel m_kernel;

    public:
      // Wraps an existing cl_kernel handle. When `retain` is true, an extra
      // reference is taken through clRetainKernel.
      kernel(cl_kernel knl, bool retain)
        : m_kernel(knl)
      {
        if (!retain)
          return;

        PYOPENCL_CALL_GUARDED(clRetainKernel, (knl));
      }

      // Creates the kernel named `kernel_name` from program `prg` through
      // clCreateKernel. Throws pyopencl::error if the call does not report
      // CL_SUCCESS.
      kernel(program const &prg, std::string const &kernel_name)
      {
        PYOPENCL_PRINT_CALL_TRACE("clCreateKernel");

        cl_int status_code;
        const char *name = kernel_name.c_str();
        m_kernel = clCreateKernel(prg.data(), name, &status_code);

        if (status_code != CL_SUCCESS)
          throw pyopencl::error("clCreateKernel", status_code);
      }

      // Releases the cl_kernel handle stored in m_kernel via
      // clReleaseKernel, using the cleanup variant of the call-guard macro.
      ~kernel()
      {
        PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseKernel, (m_kernel));
      }

      // Returns the raw cl_kernel handle held by this wrapper. The handle is
      // returned as-is; no retain/release call is made here.
      cl_kernel data() const
      {
        return m_kernel;
      }

      // NOTE(review): presumably generates the comparison helpers for
      // `kernel` -- confirm against the macro definition.
      PYOPENCL_EQUALITY_TESTS(kernel);

#if PYOPENCL_CL_VERSION >= 0x2010
      // Clones this kernel through clCloneKernel and returns a newly
      // allocated wrapper for the copy.
      //
      // Throws pyopencl::error if clCloneKernel does not report CL_SUCCESS.
      // If wrapping fails, the cloned kernel is released before the
      // exception propagates.
      kernel *clone()
      {
        cl_int status_code;

        PYOPENCL_PRINT_CALL_TRACE("clCloneKernel");
        cl_kernel result = clCloneKernel(m_kernel, &status_code);
        if (status_code != CL_SUCCESS)
          throw pyopencl::error("clCloneKernel", status_code);

        try
        {
          return new kernel(result, /* retain */ false);
        }
        catch (...)
        {
          PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseKernel, (result));
          throw;
        }
      }
#endif

      // Sets kernel argument `arg_index` to a null cl_mem handle.
      void set_arg_null(cl_uint arg_index)
      {
        cl_mem null_mem = nullptr;
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, sizeof(null_mem), &null_mem));
      }

      // Sets kernel argument `arg_index` to the cl_mem handle held by `moh`.
      void set_arg_mem(cl_uint arg_index, memory_object_holder &moh)
      {
        cl_mem mem_handle = moh.data();
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, sizeof(mem_handle), &mem_handle));
      }

      // Sets kernel argument `arg_index` using only a size: loc.size() is
      // passed as the argument size together with a null data pointer.
      void set_arg_local(cl_uint arg_index, local_memory const &loc)
      {
        const size_t byte_count = loc.size();
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, byte_count, nullptr));
      }

      // Sets kernel argument `arg_index` to the cl_sampler handle held by
      // `smp`.
      void set_arg_sampler(cl_uint arg_index, sampler const &smp)
      {
        cl_sampler sampler_handle = smp.data();
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, sizeof(sampler_handle), &sampler_handle));
      }

      // Sets kernel argument `arg_index` to the cl_command_queue handle held
      // by `queue`.
      void set_arg_command_queue(cl_uint arg_index, command_queue const &queue)
      {
        cl_command_queue queue_handle = queue.data();
        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, sizeof(queue_handle), &queue_handle));
      }

      // Converts the Python value `obj` to the C type selected by the
      // one-character code in `py_typechar` and sets it as kernel argument
      // `arg_index`.
      //
      // Supported codes: c, b, B, h, H, i, I, l, L, f, d (mapped to the C
      // types listed below). Throws error("Kernel.set_arg_buf_pack",
      // CL_INVALID_VALUE, ...) if the code is not exactly one character or
      // is not one of the supported codes.
      void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj)
      {
        std::string typechar_str(py::cast<std::string>(py_typechar));
        if (typechar_str.size() != 1)
          throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE,
              "type char argument must have exactly one character");

        char typechar = typechar_str[0];

        // One case per type code: cast to TYPE, then pass the value by
        // address with its own size.
#define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE) \
        case TYPECH_VAL: \
          { \
            TYPE val = py::cast<TYPE>(obj); \
            PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \
            break; \
          }
        switch (typechar)
        {
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('B', unsigned char)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('h', short)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('H', unsigned short)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('i', int)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('I', unsigned int)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('l', long)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('L', unsigned long)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('f', float)
          PYOPENCL_KERNEL_PACK_AND_SET_ARG('d', double)
          default:
            throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE,
                "invalid type char");
        }
#undef PYOPENCL_KERNEL_PACK_AND_SET_ARG
      }

      // Sets kernel argument `arg_index` from the raw bytes exposed by
      // `py_buffer` (requested as a contiguous buffer).
      //
      // If acquiring the buffer raises a Python error, that error is cleared
      // and error("Kernel.set_arg", CL_INVALID_VALUE, ...) is thrown instead.
      void set_arg_buf(cl_uint arg_index, py::handle py_buffer)
      {
        py_buffer_wrapper arg_view;

        try
        {
          arg_view.get(py_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
        }
        catch (py::error_already_set &)
        {
          PyErr_Clear();
          throw error("Kernel.set_arg", CL_INVALID_VALUE,
              "invalid kernel argument");
        }

        const void *bytes = arg_view.m_buf.buf;
        PYOPENCL_BUFFER_SIZE_T byte_count = arg_view.m_buf.len;

        PYOPENCL_CALL_GUARDED(clSetKernelArg,
            (m_kernel, arg_index, byte_count, bytes));
      }

#if PYOPENCL_CL_VERSION >= 0x2000
      // Sets kernel argument `arg_index` to the pointer returned by
      // wrp.ptr(), through clSetKernelArgSVMPointer.
      void set_arg_svm(cl_uint arg_index, svm_arg_wrapper const &wrp)
      {
        PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer,
            (m_kernel, arg_index, wrp.ptr()));
      }
#endif

      // Sets kernel argument `arg_index` from an arbitrary Python object.
      //
      // None is passed as a null cl_mem. Otherwise `arg` is tried, in order,
      // as a memory object, an SVM argument (CL >= 2.0 builds only), a
      // local_memory, a sampler and a command queue; the first successful
      // cast wins. A py::cast_error from one attempt simply moves on to the
      // next. If no cast succeeds, `arg` is handed to set_arg_buf().
      void set_arg(cl_uint arg_index, py::handle arg)
      {
        if (arg.ptr() == Py_None)
        {
          set_arg_null(arg_index);
          return;
        }

        try
        {
          set_arg_mem(arg_index, arg.cast<memory_object_holder &>());
          return;
        }
        catch (py::cast_error &) { }

#if PYOPENCL_CL_VERSION >= 0x2000
        try
        {
          set_arg_svm(arg_index, arg.cast<svm_arg_wrapper const &>());
          return;
        }
        catch (py::cast_error &) { }
#endif

        try
        {
          set_arg_local(arg_index, arg.cast<local_memory>());
          return;
        }
        catch (py::cast_error &) { }

        try
        {
          set_arg_sampler(arg_index, arg.cast<const sampler &>());
          return;
        }
        catch (py::cast_error &) { }

        try
        {
          set_arg_command_queue(arg_index, arg.cast<const command_queue &>());
          return;
        }
        catch (py::cast_error &) { }

        set_arg_buf(arg_index, arg);
      }

      // Returns the kernel property selected by `param_name` as a Python
      // object. Throws error("Kernel.get_info", CL_INVALID_VALUE) for any
      // `param_name` not handled below.
      //
      // NOTE(review): the cases carry no `break`; the PYOPENCL_GET_*_INFO
      // macros are assumed to return from the function -- confirm against
      // their definitions.
      py::object get_info(cl_kernel_info param_name) const
      {
        switch (param_name)
        {
          case CL_KERNEL_FUNCTION_NAME:
            PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
          // Both of these are fetched as cl_uint.
          case CL_KERNEL_NUM_ARGS:
          case CL_KERNEL_REFERENCE_COUNT:
            PYOPENCL_GET_TYPED_INFO(Kernel, m_kernel, param_name,
                cl_uint);
          case CL_KERNEL_CONTEXT:
            PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
                cl_context, context);
          case CL_KERNEL_PROGRAM:
            PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
                cl_program, program);
#if PYOPENCL_CL_VERSION >= 0x1020
          case CL_KERNEL_ATTRIBUTES:
            PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
#endif
          default:
            throw error("Kernel.get_info", CL_INVALID_VALUE);
        }
      }

      // Returns the work-group property `param_name` of this kernel on
      // device `dev` as a Python object. Throws
      // error("Kernel.get_work_group_info", CL_INVALID_VALUE) for any
      // `param_name` not handled below.
      //
      // NOTE(review): the cases carry no `break`; the PYOPENCL_GET_*_INFO and
      // PYOPENCL_RETURN_VECTOR macros are assumed to return -- confirm
      // against their definitions.
      py::object get_work_group_info(
          cl_kernel_work_group_info param_name,
          device const &dev
          ) const
      {
        switch (param_name)
        {
          // Lets the single "first argument" slot of the info macros carry
          // both the kernel and the device.
#define PYOPENCL_FIRST_ARG m_kernel, dev.data() // hackety hack
          case CL_KERNEL_WORK_GROUP_SIZE:
            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                PYOPENCL_FIRST_ARG, param_name,
                size_t);
          case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
            {
              std::vector<size_t> result;
              PYOPENCL_GET_VEC_INFO(KernelWorkGroup,
                  PYOPENCL_FIRST_ARG, param_name, result);

              PYOPENCL_RETURN_VECTOR(size_t, result);
            }
          // The memory-size queries are fetched as cl_ulong.
          case CL_KERNEL_LOCAL_MEM_SIZE:
#if PYOPENCL_CL_VERSION >= 0x1010
          case CL_KERNEL_PRIVATE_MEM_SIZE:
#endif
            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                PYOPENCL_FIRST_ARG, param_name,
                cl_ulong);

#if PYOPENCL_CL_VERSION >= 0x1010
          case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                PYOPENCL_FIRST_ARG, param_name,
                size_t);
#endif
          default:
            throw error("Kernel.get_work_group_info", CL_INVALID_VALUE);
#undef PYOPENCL_FIRST_ARG
        }
      }

#if PYOPENCL_CL_VERSION >= 0x1020
      // Returns property `param_name` of kernel argument `arg_index` as a
      // Python object. Throws error("Kernel.get_arg_info", CL_INVALID_VALUE)
      // for any `param_name` not handled below.
      //
      // NOTE(review): the cases carry no `break`; the PYOPENCL_GET_*_INFO
      // macros are assumed to return -- confirm against their definitions.
      py::object get_arg_info(
          cl_uint arg_index,
          cl_kernel_arg_info param_name
          ) const
      {
        switch (param_name)
        {
          // Lets the single "first argument" slot of the info macros carry
          // both the kernel and the argument index.
#define PYOPENCL_FIRST_ARG m_kernel, arg_index // hackety hack
          case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
            PYOPENCL_GET_TYPED_INFO(KernelArg,
                PYOPENCL_FIRST_ARG, param_name,
                cl_kernel_arg_address_qualifier);

          case CL_KERNEL_ARG_ACCESS_QUALIFIER:
            PYOPENCL_GET_TYPED_INFO(KernelArg,
                PYOPENCL_FIRST_ARG, param_name,
                cl_kernel_arg_access_qualifier);

          // Both names are fetched as strings.
          case CL_KERNEL_ARG_TYPE_NAME:
          case CL_KERNEL_ARG_NAME:
            PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name);

          case CL_KERNEL_ARG_TYPE_QUALIFIER:
            PYOPENCL_GET_TYPED_INFO(KernelArg,
                PYOPENCL_FIRST_ARG, param_name,
                cl_kernel_arg_type_qualifier);
#undef PYOPENCL_FIRST_ARG
          default:
            throw error("Kernel.get_arg_info", CL_INVALID_VALUE);
        }
      }
#endif

#if PYOPENCL_CL_VERSION >= 0x2010
    // Queries clGetKernelSubGroupInfo for this kernel on device `dev`.
    //
    // The meaning of `py_input_value` depends on `param_name`, as marked on
    // each group of cases below: a sequence of sizes, a single size, or
    // unused. Throws error("Kernel.get_sub_group_info", CL_INVALID_VALUE)
    // for any `param_name` not handled below.
    py::object get_sub_group_info(
        device const &dev,
        cl_kernel_sub_group_info param_name,
        py::object py_input_value)
    {
      switch (param_name)
      {
        // size_t * -> size_t
        case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE:
        case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE:
          {
            std::vector<size_t> input_value;
            // NOTE(review): presumably fills input_value from
            // py_input_value -- confirm against the macro definition.
            COPY_PY_LIST(size_t, input_value);

            size_t param_value;
            PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo,
                (m_kernel, dev.data(), param_name,
                 input_value.size()*sizeof(input_value.front()),
                 input_value.empty() ? nullptr : &input_value.front(),
                 sizeof(param_value), &param_value, 0));

            return py::cast(param_value);
          }

        // size_t -> size_t[]
        case CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT:
          {
            size_t input_value = py::cast<size_t>(py_input_value);

            std::vector<size_t> result;
            size_t size;
            // First call: no output buffer, only the required byte count is
            // requested.
            PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo,
                (m_kernel, dev.data(), param_name,
                 sizeof(input_value), &input_value,
                 0, nullptr, &size));
            result.resize(size / sizeof(result.front()));
            // Second call: fetch the values into the resized vector.
            PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo,
                (m_kernel, dev.data(), param_name,
                 sizeof(input_value), &input_value,
                 size, result.empty() ? nullptr : &result.front(), 0));

            PYOPENCL_RETURN_VECTOR(size_t, result);
          }

        // () -> size_t
        case CL_KERNEL_MAX_NUM_SUB_GROUPS:
        case CL_KERNEL_COMPILE_NUM_SUB_GROUPS:
          {
            size_t param_value;
            PYOPENCL_CALL_GUARDED(clGetKernelSubGroupInfo,
                (m_kernel, dev.data(), param_name,
                 0, nullptr,
                 sizeof(param_value), &param_value, 0));

            return py::cast(param_value);
          }

        default:
          throw error("Kernel.get_sub_group_info", CL_INVALID_VALUE);
      }
  }
#endif
// Catch clauses to be placed directly after a `try` block. They expect the
// locals `arg_index` (cl_uint) and `arg_value` (py::handle) to be in scope.
//
// A caught pyopencl `error` is rethrown as a new `error` with the same
// routine and code, and a message prefixed with the 1-based argument number;
// if `arg_value` is an instance of pyopencl.array.Array, a hint about
// passing 'array.data' is appended. Any other std::exception is rethrown as
// std::runtime_error with the same prefix.
#define PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER \
    catch (error &err) \
    { \
      std::string msg( \
          std::string("when processing arg#") + std::to_string(arg_index+1) \
          + std::string(" (1-based): ") + std::string(err.what())); \
      auto mod_cl_ary(py::module::import("pyopencl.array")); \
      auto cls_array(mod_cl_ary.attr("Array")); \
      if (arg_value.ptr() && py::isinstance(arg_value, cls_array)) \
        msg.append( \
            " (perhaps you meant to pass 'array.data' instead of the array itself?)"); \
      throw error(err.routine().c_str(), err.code(), msg.c_str()); \
    } \
    catch (std::exception &err) \
    { \
      std::string msg( \
          std::string("when processing arg#") + std::to_string(arg_index+1) \
          + std::string(" (1-based): ") + std::string(err.what())); \
      throw std::runtime_error(msg.c_str()); \
    }

  // Walks `args_and_indices` as a flat sequence of (index, value) pairs and
  // calls `set_arg_func(index, value)` for each pair. Errors are rewritten
  // by PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER, which reads the locals
  // `arg_index` and `arg_value` by name.
  inline
  void set_arg_multi(
      std::function<void(cl_uint, py::handle)> set_arg_func,
      py::tuple args_and_indices)
  {
    cl_uint arg_index;
    py::handle arg_value;

    auto cursor = args_and_indices.begin();
    auto stop = args_and_indices.end();
    try
    {
      /* Internal interface: the tuple is trusted to be well-formed, so
       * running off the end in the middle of a pair is deliberately not
       * checked.
       */
      for (; cursor != stop; )
      {
        // Sentinel, so a failing integer cast is reported with a
        // recognizable argument number.
        arg_index = 9999 - 1;

        arg_index = py::cast<cl_uint>(*cursor++);
        arg_value = *cursor++;
        set_arg_func(arg_index, arg_value);
      }
    }
    PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER
  }


  // Walks `args_and_indices` as a flat sequence of (index, descriptor,
  // value) triples and calls `set_arg_func(index, descriptor, value)` for
  // each triple. Errors are rewritten by
  // PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER, which reads the locals
  // `arg_index` and `arg_value` by name.
  inline
  void set_arg_multi(
      std::function<void(cl_uint, py::handle, py::handle)> set_arg_func,
      py::tuple args_and_indices)
  {
    cl_uint arg_index;
    py::handle arg_descr, arg_value;

    auto cursor = args_and_indices.begin();
    auto stop = args_and_indices.end();
    try
    {
      /* Internal interface: the tuple is trusted to be well-formed, so
       * running off the end in the middle of a triple is deliberately not
       * checked.
       */
      for (; cursor != stop; )
      {
        // Sentinel, so a failing integer cast is reported with a
        // recognizable argument number.
        arg_index = 9999 - 1;

        arg_index = py::cast<cl_uint>(*cursor++);
        arg_descr = *cursor++;
        arg_value = *cursor++;
        set_arg_func(arg_index, arg_descr, arg_value);
      }
    }
    PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER
  }


  // Creates kernel objects for all kernels in `pgm` through
  // clCreateKernelsInProgram and returns them as a Python list of wrapped
  // `kernel` objects.
  inline
  py::list create_kernels_in_program(program &pgm)
  {
    // First call: ask only for the number of kernels.
    cl_uint num_kernels;
    PYOPENCL_CALL_GUARDED(clCreateKernelsInProgram, (
          pgm.data(), 0, 0, &num_kernels));

    // Second call: create the kernels into a vector of that size.
    std::vector<cl_kernel> kernels(num_kernels);
    PYOPENCL_CALL_GUARDED(clCreateKernelsInProgram, (
          pgm.data(), num_kernels,
          kernels.empty( ) ? nullptr : &kernels.front(), &num_kernels));

    // NOTE(review): each wrapper is built with retain=true, i.e. it takes an
    // additional reference on a kernel that was just created here. Verify
    // against the clCreateKernelsInProgram ownership rules that this does
    // not leave one reference unreleased.
    py::list result;
    for (cl_kernel knl: kernels)
      result.append(handle_from_new_ptr(new kernel(knl, true)));

    return result;
  }

#define MAX_WS_DIM_COUNT 10

  inline
  event *enqueue_nd_range_kernel(
      command_queue &cq,
      kernel &knl,
      py::handle py_global_work_size,
      py::handle py_local_work_size,
      py::handle py_global_work_offset,
      py::handle py_wait_for,
      bool g_times_l,
      bool allow_empty_ndrange)
  {
    PYOPENCL_PARSE_WAIT_FOR;

    std::array<size_t, MAX_WS_DIM_COUNT> global_work_size;
    unsigned gws_size = 0;
    COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_size);
    cl_uint work_dim = gws_size;

    std::array<size_t, MAX_WS_DIM_COUNT> local_work_size;
    size_t *local_work_size_ptr = nullptr;

    if (py_local_work_size.ptr() != Py_None)
    {
      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_size);

      if (g_times_l)
        work_dim = std::max(work_dim, lws_size);
          throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
              "global/local work sizes have differing dimensions");

      while (lws_size < work_dim)
        local_work_size[lws_size++] = 1;
      while (gws_size < work_dim)
        global_work_size[gws_size++] = 1;
      local_work_size_ptr = &local_work_size.front();
    if (g_times_l && lws_size)
    {
      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
        global_work_size[work_axis] *= local_work_size[work_axis];
    }

    size_t *global_work_offset_ptr = nullptr;
    std::array<size_t, MAX_WS_DIM_COUNT> global_work_offset;
    if (py_global_work_offset.ptr() != Py_None)
    {
      unsigned gwo_size = 0;
      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_size);

      if (work_dim != gwo_size)
        throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
            "global work size and offset have differing dimensions");

      if (g_times_l && local_work_size_ptr)
      {
        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
          global_work_offset[work_axis] *= local_work_size[work_axis];
      }

      global_work_offset_ptr = &global_work_offset.front();
    if (allow_empty_ndrange)
    {
#if PYOPENCL_CL_VERSION >= 0x1020
      bool is_empty = false;
      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
        if (global_work_size[work_axis] == 0)
          is_empty = true;
      if (local_work_size_ptr)
        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
          if (local_work_size_ptr[work_axis] == 0)
            is_empty = true;

      if (is_empty)
      {