diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d41702c9002b4d649501134dc7a47d8f4a26a90d..e7520a8c116bc17839194160c97b4bafd44126a4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,6 +55,7 @@ jobs: run: | CONDA_ENVIRONMENT=.test-conda-env-py3.yml curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh + ./configure.py --cl-use-shipped-ext . ./build-and-test-py-project-within-miniconda.sh pytest_mac: @@ -84,6 +85,7 @@ jobs: CONDA_ENVIRONMENT=.test-conda-env-py3.yml curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh . ci-support.sh + ./configure.py --cl-use-shipped-ext build_py_project_in_conda_env build_docs diff --git a/doc/make_constants.py b/doc/make_constants.py index cca0b7ce459c7dd091c98ea1bf64e6a51835286f..803a52d04a4a5c9d86d95f11bea7a100a9dd12ad 100644 --- a/doc/make_constants.py +++ b/doc/make_constants.py @@ -203,6 +203,10 @@ const_ext_lookup = { "PIPE_SUPPORT": cl_30, }, + cl.device_topology_type_amd: { + "PCIE": amd_devattr, + }, + cl.mem_object_type: { "IMAGE2D_ARRAY": cl_12, "IMAGE1D": cl_12, diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 354b942bf90071c4b0a79310fd3724245cf24266..1fa14a7bba43b5d29b6a7e8c04178da40e24627e 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -51,13 +51,14 @@ import sys _PYPY = "__pypy__" in sys.builtin_module_names -from pyopencl._cl import ( # noqa +from pyopencl._cl import ( # noqa: F401 get_cl_header_version, program_kind, status_code, platform_info, device_type, device_info, + device_topology_type_amd, device_fp_config, device_mem_cache_type, device_local_mem_type, @@ -153,9 +154,13 @@ from pyopencl._cl import ( # noqa Image, Sampler, - DeviceTopologyAmd, ) +try: + from pyopencl._cl import DeviceTopologyAmd # noqa: F401 +except ImportError: + pass + if not _PYPY: # FIXME: Add back to default set when pypy support catches up from pyopencl._cl import ( # noqa @@ -1121,10 +1126,8 @@ def _add_functionality(): """ svmallocation_old_init(self, ctx, size, alignment, flags) - read_write = ( - flags & mem_flags.WRITE_ONLY != 0 - or flags & mem_flags.READ_WRITE != 0) - + # mem_flags.READ_ONLY applies to kernels, not the host + read_write = True _interface["data"] = ( int(self._ptr_as_int()), not read_write) diff --git a/pyopencl/array.py b/pyopencl/array.py index d3d353d1bca6b70bc6e42127f8ab0da770ded2f4..bf58c965c69d69820a405de7a90eb7cd1e74bc87 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -50,6 +50,25 @@ def _get_common_dtype(obj1, obj2, queue): has_double_support(queue.device)) +def _get_truedivide_dtype(obj1, obj2, queue): + # the dtype of the division result obj1 / obj2 + + allow_double = has_double_support(queue.device) + + x1 = obj1 if np.isscalar(obj1) else np.ones(1, obj1.dtype) + x2 = obj2 if np.isscalar(obj2) else np.ones(1, obj2.dtype) + + result = (x1/x2).dtype + + if not allow_double: + if result == np.float64: + result = np.dtype(np.float32) + elif result == np.complex128: + result = np.dtype(np.complex64) + + return result + + # Work around PyPy not currently supporting the object dtype. # (Yes, it doesn't even support checking!) # (as of May 27, 2014 on PyPy 2.3) @@ -63,6 +82,10 @@ except Exception: return False +class InconsistentOpenCLQueueWarning(UserWarning): + pass + + class VecLookupWarner: def __getattr__(self, name): from warnings import warn @@ -125,7 +148,11 @@ def elwise_kernel_runner(kernel_getter): def kernel_runner(*args, **kwargs): repr_ary = args[0] - queue = kwargs.pop("queue", None) or repr_ary.queue + queue = kwargs.pop("queue", None) + implicit_queue = queue is None + if implicit_queue: + queue = repr_ary.queue + wait_for = kwargs.pop("wait_for", None) # wait_for must be a copy, because we modify it in-place below @@ -152,6 +179,16 @@ def elwise_kernel_runner(kernel_getter): actual_args.append(arg.base_data) actual_args.append(arg.offset) wait_for.extend(arg.events) + + if (implicit_queue + and arg.queue is not None + and arg.queue != queue): + from warnings import warn + + warn("Implicit queue in elementwise operation does not match " + "queue of a provided argument. This will become an " + "error in 2021.", + type=InconsistentOpenCLQueueWarning) else: actual_args.append(arg) actual_args.append(repr_ary.size) @@ -989,7 +1026,7 @@ class Array: result.add_event( self._axpbyz(result, self.dtype.type(1), self, - other.dtype.type(-1), other)) + result.dtype.type(-1), other)) return result else: @@ -1012,7 +1049,7 @@ class Array: # other must be a scalar result = self._new_like_me(common_dtype) result.add_event( - self._axpbz(result, self.dtype.type(-1), self, + self._axpbz(result, result.dtype.type(-1), self, common_dtype.type(other))) return result @@ -1080,20 +1117,20 @@ class Array: def __div__(self, other): """Divides an array by an array or a scalar, i.e. ``self / other``. """ + common_dtype = _get_truedivide_dtype(self, other, self.queue) if isinstance(other, Array): - result = self._new_like_me( - _get_common_dtype(self, other, self.queue)) + result = self._new_like_me(common_dtype) result.add_event(self._div(result, self, other)) else: if other == 1: return self.copy() else: # create a new array for the result - common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) result.add_event( self._axpbz(result, - common_dtype.type(1/other), self, self.dtype.type(0))) + np.true_divide(common_dtype.type(1), other), + self, self.dtype.type(0))) return result @@ -1102,14 +1139,13 @@ class Array: def __rdiv__(self, other): """Divides an array by a scalar or an array, i.e. ``other / self``. """ + common_dtype = _get_truedivide_dtype(self, other, self.queue) if isinstance(other, Array): - result = self._new_like_me( - _get_common_dtype(self, other, self.queue)) + result = self._new_like_me(common_dtype) result.add_event(other._div(result, self)) else: # create a new array for the result - common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) result.add_event( self._rdiv_scalar(result, self, common_dtype.type(other))) @@ -1118,6 +1154,26 @@ class Array: __rtruediv__ = __rdiv__ + def __itruediv__(self, other): + # raise an error if the result cannot be cast to self + common_dtype = _get_truedivide_dtype(self, other, self.queue) + if not np.can_cast(common_dtype, self.dtype.type): + raise TypeError("Cannot cast {!r} to {!r}" + .format(self.dtype, common_dtype)) + + if isinstance(other, Array): + self.add_event( + self._div(self, self, other)) + else: + if other == 1: + return self + else: + self.add_event( + self._axpbz(self, common_dtype.type(np.true_divide(1, other)), + self, self.dtype.type(0))) + + return self + def __and__(self, other): common_dtype = _get_common_dtype(self, other, self.queue) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 357aa2bbf17477713905d040376ec199a518f877..df364eda3c883d378c1e9d25136d8f59f5763f9d 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -503,36 +503,36 @@ def real_dtype(dtype): @context_dependent_memoize def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): - ax = "a*x[i]" - by = "b*y[i]" + result_t = dtype_to_ctype(dtype_z) x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" - if x_is_complex: - ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x) - - if y_is_complex: - by = "%s_mul(b, y[i])" % complex_dtype_to_name(dtype_y) + if dtype_z.kind == "c": + # a and b will always be complex here. + z_ct = complex_dtype_to_name(dtype_z) - if x_is_complex and not y_is_complex: - by = "{}_fromreal({})".format(complex_dtype_to_name(dtype_x), by) + if x_is_complex: + ax = f"{z_ct}_mul(a, {z_ct}_cast(x[i]))" + else: + ax = f"{z_ct}_mulr(a, x[i])" - if not x_is_complex and y_is_complex: - ax = "{}_fromreal({})".format(complex_dtype_to_name(dtype_y), ax) + if y_is_complex: + by = f"{z_ct}_mul(b, {z_ct}_cast(y[i]))" + else: + by = f"{z_ct}_mulr(b, y[i])" - if x_is_complex or y_is_complex: - result = ( - "{root}_add({root}_cast({ax}), {root}_cast({by}))" - .format( - ax=ax, - by=by, - root=complex_dtype_to_name(dtype_z))) + result = f"{z_ct}_add({ax}, {by})" else: + # real-only + + ax = f"a*(({result_t}) x[i])" + by = f"b*(({result_t}) y[i])" + result = f"{ax} + {by}" return get_elwise_kernel(context, - "{tp_z} *z, {tp_x} a, {tp_x} *x, {tp_y} b, {tp_y} *y".format( + "{tp_z} *z, {tp_z} a, {tp_x} *x, {tp_z} b, {tp_y} *y".format( tp_x=dtype_to_ctype(dtype_x), tp_y=dtype_to_ctype(dtype_y), tp_z=dtype_to_ctype(dtype_z), diff --git a/pyopencl/version.py b/pyopencl/version.py index 0668cdc6768acd4654d85574dc4fd69788891456..4d2090dc6f60cab9113cb2798d7bf57991284252 100644 --- a/pyopencl/version.py +++ b/pyopencl/version.py @@ -1,3 +1,3 @@ -VERSION = (2020, 2, 1) +VERSION = (2020, 2, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..f2a2f6894081711b89214e24c18a5104f99db607 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers= + bitonic: tests involving bitonic sort diff --git a/src/clinfo_ext.h b/src/clinfo_ext.h index 43b7b6082fda28ad433f26c5d9a5e2e743e24940..dc4d6a8e6b9dd42b58b1fdf5dafeadd5fb2a4671 100644 --- a/src/clinfo_ext.h +++ b/src/clinfo_ext.h @@ -71,6 +71,10 @@ #define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A #define CL_DEVICE_GFXIP_MINOR_AMD 0x404B #define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C +#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 +#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 +#define CL_DEVICE_PCIE_ID_AMD 0x4034 #ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD #define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1 diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 1c7482d78b1bc68b3dedc2add7367bf277e5111b..42ee4c11cdea48dfc11043eeb09a68f631d20ada 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -364,7 +364,7 @@ -#define PYOPENCL_GET_INTEGRAL_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE) \ +#define PYOPENCL_GET_TYPED_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE) \ { \ TYPE param_value; \ PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \ @@ -669,7 +669,7 @@ namespace pyopencl py::object get_info(cl_device_info param_name) const { #define DEV_GET_INT_INF(TYPE) \ - PYOPENCL_GET_INTEGRAL_INFO(Device, m_device, param_name, TYPE); + PYOPENCL_GET_TYPED_INFO(Device, m_device, param_name, TYPE); switch (param_name) { @@ -836,15 +836,15 @@ namespace pyopencl // {{{ AMD dev attrs cl_amd_device_attribute_query // // types of AMD dev attrs divined from -// https://www.khronos.org/registry/cl/api/1.2/cl.hpp +// https://github.com/KhronosGroup/OpenCL-CLHPP/blob/3b03738fef487378b188d21cc5f2bae276aa8721/include/CL/opencl.hpp#L1471-L1500 #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD case CL_DEVICE_PROFILING_TIMER_OFFSET_AMD: DEV_GET_INT_INF(cl_ulong); #endif -/* FIXME #ifdef CL_DEVICE_TOPOLOGY_AMD case CL_DEVICE_TOPOLOGY_AMD: + PYOPENCL_GET_TYPED_INFO( + Device, m_device, param_name, cl_device_topology_amd); #endif -*/ #ifdef CL_DEVICE_BOARD_NAME_AMD case CL_DEVICE_BOARD_NAME_AMD: ; PYOPENCL_GET_STR_INFO(Device, m_device, param_name); @@ -875,6 +875,17 @@ namespace pyopencl #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD case CL_DEVICE_LOCAL_MEM_BANKS_AMD: DEV_GET_INT_INF(cl_uint); #endif +// FIXME: MISSING: +// +// CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD +// CL_DEVICE_GFXIP_MAJOR_AMD +// CL_DEVICE_GFXIP_MINOR_AMD +// CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD +// CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD +// CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD +// CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD +// CL_DEVICE_PCIE_ID_AMD + // }}} #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT @@ -1037,7 +1048,7 @@ namespace pyopencl switch (param_name) { case CL_CONTEXT_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO( + PYOPENCL_GET_TYPED_INFO( Context, m_context, param_name, cl_uint); case CL_CONTEXT_DEVICES: @@ -1101,7 +1112,7 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x1010 case CL_CONTEXT_NUM_DEVICES: - PYOPENCL_GET_INTEGRAL_INFO( + PYOPENCL_GET_TYPED_INFO( Context, m_context, param_name, cl_uint); #endif @@ -1438,10 +1449,10 @@ namespace pyopencl PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name, cl_device_id, device); case CL_QUEUE_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name, + PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name, cl_uint); case CL_QUEUE_PROPERTIES: - PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name, + PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name, cl_command_queue_properties); default: @@ -1517,13 +1528,13 @@ namespace pyopencl PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name, cl_command_queue, command_queue); case CL_EVENT_COMMAND_TYPE: - PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, + PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name, cl_command_type); case CL_EVENT_COMMAND_EXECUTION_STATUS: - PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, + PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name, cl_int); case CL_EVENT_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, + PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name, cl_uint); #if PYOPENCL_CL_VERSION >= 0x1010 case CL_EVENT_CONTEXT: @@ -1547,7 +1558,7 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x2000 case CL_PROFILING_COMMAND_COMPLETE: #endif - PYOPENCL_GET_INTEGRAL_INFO(EventProfiling, m_event, param_name, + PYOPENCL_GET_TYPED_INFO(EventProfiling, m_event, param_name, cl_ulong); default: throw error("Event.get_profiling_info", CL_INVALID_VALUE); @@ -2462,7 +2473,7 @@ namespace pyopencl switch (param_name) { case CL_IMAGE_FORMAT: - PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, + PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, cl_image_format); case CL_IMAGE_ELEMENT_SIZE: case CL_IMAGE_ROW_PITCH: @@ -2473,7 +2484,7 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x1020 case CL_IMAGE_ARRAY_SIZE: #endif - PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, size_t); + PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, size_t); #if PYOPENCL_CL_VERSION >= 0x1020 case CL_IMAGE_BUFFER: @@ -2492,7 +2503,7 @@ namespace pyopencl case CL_IMAGE_NUM_MIP_LEVELS: case CL_IMAGE_NUM_SAMPLES: - PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, cl_uint); + PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, cl_uint); #endif default: @@ -3566,19 +3577,19 @@ namespace pyopencl switch (param_name) { case CL_SAMPLER_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, cl_uint); case CL_SAMPLER_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Sampler, m_sampler, param_name, cl_context, context); case CL_SAMPLER_ADDRESSING_MODE: - PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, cl_addressing_mode); case CL_SAMPLER_FILTER_MODE: - PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, cl_filter_mode); case CL_SAMPLER_NORMALIZED_COORDS: - PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, + PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name, cl_bool); default: @@ -3631,13 +3642,13 @@ namespace pyopencl switch (param_name) { case CL_PROGRAM_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name, + PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_uint); case CL_PROGRAM_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Program, m_program, param_name, cl_context, context); case CL_PROGRAM_NUM_DEVICES: - PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name, + PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, cl_uint); case CL_PROGRAM_DEVICES: { @@ -3703,7 +3714,7 @@ namespace pyopencl // }}} #if PYOPENCL_CL_VERSION >= 0x1020 case CL_PROGRAM_NUM_KERNELS: - PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name, + PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name, size_t); case CL_PROGRAM_KERNEL_NAMES: PYOPENCL_GET_STR_INFO(Program, m_program, param_name); @@ -3722,7 +3733,7 @@ namespace pyopencl { #define PYOPENCL_FIRST_ARG m_program, dev.data() // hackety hack case CL_PROGRAM_BUILD_STATUS: - PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild, + PYOPENCL_GET_TYPED_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name, cl_build_status); case CL_PROGRAM_BUILD_OPTIONS: @@ -3731,13 +3742,13 @@ namespace pyopencl PYOPENCL_FIRST_ARG, param_name); #if PYOPENCL_CL_VERSION >= 0x1020 case CL_PROGRAM_BINARY_TYPE: - PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild, + PYOPENCL_GET_TYPED_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name, cl_program_binary_type); #endif #if PYOPENCL_CL_VERSION >= 0x2000 case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE: - PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild, + PYOPENCL_GET_TYPED_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name, size_t); #endif @@ -4189,7 +4200,7 @@ namespace pyopencl PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name); case CL_KERNEL_NUM_ARGS: case CL_KERNEL_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(Kernel, m_kernel, param_name, + PYOPENCL_GET_TYPED_INFO(Kernel, m_kernel, param_name, cl_uint); case CL_KERNEL_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name, @@ -4215,7 +4226,7 @@ namespace pyopencl { #define PYOPENCL_FIRST_ARG m_kernel, dev.data() // hackety hack case CL_KERNEL_WORK_GROUP_SIZE: - PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, + PYOPENCL_GET_TYPED_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, size_t); case CL_KERNEL_COMPILE_WORK_GROUP_SIZE: @@ -4230,13 +4241,13 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x1010 case CL_KERNEL_PRIVATE_MEM_SIZE: #endif - PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, + PYOPENCL_GET_TYPED_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, cl_ulong); #if PYOPENCL_CL_VERSION >= 0x1010 case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: - PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, + PYOPENCL_GET_TYPED_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, size_t); #endif @@ -4256,12 +4267,12 @@ namespace pyopencl { #define PYOPENCL_FIRST_ARG m_kernel, arg_index // hackety hack case CL_KERNEL_ARG_ADDRESS_QUALIFIER: - PYOPENCL_GET_INTEGRAL_INFO(KernelArg, + PYOPENCL_GET_TYPED_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name, cl_kernel_arg_address_qualifier); case CL_KERNEL_ARG_ACCESS_QUALIFIER: - PYOPENCL_GET_INTEGRAL_INFO(KernelArg, + PYOPENCL_GET_TYPED_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name, cl_kernel_arg_access_qualifier); @@ -4270,7 +4281,7 @@ namespace pyopencl PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name); case CL_KERNEL_ARG_TYPE_QUALIFIER: - PYOPENCL_GET_INTEGRAL_INFO(KernelArg, + PYOPENCL_GET_TYPED_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name, cl_kernel_arg_type_qualifier); #undef PYOPENCL_FIRST_ARG @@ -4476,9 +4487,9 @@ namespace pyopencl switch (param_name) { case CL_GL_TEXTURE_TARGET: - PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLenum); + PYOPENCL_GET_TYPED_INFO(GLTexture, data(), param_name, GLenum); case CL_GL_MIPMAP_LEVEL: - PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLint); + PYOPENCL_GET_TYPED_INFO(GLTexture, data(), param_name, GLint); default: throw error("MemoryObject.get_gl_texture_info", CL_INVALID_VALUE); @@ -4729,22 +4740,22 @@ namespace pyopencl switch (param_name) { case CL_MEM_TYPE: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, cl_mem_object_type); case CL_MEM_FLAGS: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, cl_mem_flags); case CL_MEM_SIZE: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, size_t); case CL_MEM_HOST_PTR: throw pyopencl::error("MemoryObject.get_info", CL_INVALID_VALUE, "Use MemoryObject.get_host_array to get host pointer."); case CL_MEM_MAP_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, cl_uint); case CL_MEM_REFERENCE_COUNT: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, cl_uint); case CL_MEM_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(MemObject, data(), param_name, @@ -4765,7 +4776,7 @@ namespace pyopencl return create_mem_object_wrapper(param_value); } case CL_MEM_OFFSET: - PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, + PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name, size_t); #endif diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index cbd1f9a40f85ee0c71be5adfe38bc8cd1cd20e50..175b5aa5932a16ac0528bb7656778f179b490f6f 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -552,41 +552,6 @@ void pyopencl_expose_part_2(py::module &m) #endif // }}} - // {{{ CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD - - { - typedef cl_device_topology_amd cls; - py::class_<cls>(m, "DeviceTopologyAmd") - .def(py::init( - [](cl_char bus, cl_char device, cl_char function) - { - cl_device_topology_amd result; - result.pcie.bus = bus; - result.pcie.device = device; - result.pcie.function = function; - return result; - }), - py::arg("bus")=0, - py::arg("device")=0, - py::arg("function")=0) - - .def_property("type", - [](cls &t) { return t.pcie.type; }, - [](cls &t, cl_uint val) { t.pcie.type = val; }) - - .def_property("bus", - [](cls &t) { return t.pcie.bus; }, - [](cls &t, cl_char val) { t.pcie.bus = val; }) - .def_property("device", - [](cls &t) { return t.pcie.device; }, - [](cls &t, cl_char val) { t.pcie.device = val; }) - .def_property("function", - [](cls &t) { return t.pcie.function; }, - [](cls &t, cl_char val) { t.pcie.function = val; }) - ; - } - - // }}} } diff --git a/src/wrap_constants.cpp b/src/wrap_constants.cpp index 6fc5658fa24894acd182328825e49f61dce3b26c..5f3e4324e9bd84039d7f5a4e668f2cbf02ee9510 100644 --- a/src/wrap_constants.cpp +++ b/src/wrap_constants.cpp @@ -40,6 +40,7 @@ namespace class platform_info { }; class device_type { }; class device_info { }; + class device_topology_type_amd { }; class device_fp_config { }; class device_mem_cache_type { }; class device_local_mem_type { }; @@ -370,6 +371,7 @@ void pyopencl_expose_constants(py::module &m) ADD_ATTR(DEVICE_, PCI_SLOT_ID_NV); #endif #endif + // {{{ cl_amd_device_attribute_query #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD ADD_ATTR(DEVICE_, PROFILING_TIMER_OFFSET_AMD); @@ -410,7 +412,6 @@ void pyopencl_expose_constants(py::module &m) #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD ADD_ATTR(DEVICE_, LOCAL_MEM_BANKS_AMD); #endif -// }}} #ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD ADD_ATTR(DEVICE_, THREAD_TRACE_SUPPORTED_AMD); #endif @@ -423,6 +424,19 @@ void pyopencl_expose_constants(py::module &m) #ifdef CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD ADD_ATTR(DEVICE_, AVAILABLE_ASYNC_QUEUES_AMD); #endif +#ifdef CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD + ADD_ATTR(DEVICE_, PREFERRED_WORK_GROUP_SIZE_AMD); +#endif +#ifdef CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD + ADD_ATTR(DEVICE_, MAX_WORK_GROUP_SIZE_AMD); +#endif +#ifdef CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD + ADD_ATTR(DEVICE_, PREFERRED_CONSTANT_BUFFER_SIZE_AMD); +#endif +#ifdef CL_DEVICE_PCIE_ID_AMD + ADD_ATTR(DEVICE_, PCIE_ID_AMD); +#endif +// }}} #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT ADD_ATTR(DEVICE_, MAX_ATOMIC_COUNTERS_EXT); @@ -515,6 +529,13 @@ void pyopencl_expose_constants(py::module &m) #endif } + { + py::class_<device_topology_type_amd> cls(m, "device_topology_type_amd"); +#ifdef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD + cls.attr("PCIE") = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; +#endif + } + { py::class_<device_fp_config> cls(m, "device_fp_config"); ADD_ATTR(FP_, DENORM); @@ -1105,6 +1126,45 @@ void pyopencl_expose_constants(py::module &m) #endif // }}} + + // {{{ CL_DEVICE_TOPOLOGY_AMD + +#ifdef CL_DEVICE_TOPOLOGY_AMD + { + typedef cl_device_topology_amd cls; + py::class_<cls>(m, "DeviceTopologyAmd") + .def(py::init( + [](cl_char bus, cl_char device, cl_char function) + { + cl_device_topology_amd result; + result.pcie.bus = bus; + result.pcie.device = device; + result.pcie.function = function; + return result; + }), + py::arg("bus")=0, + py::arg("device")=0, + py::arg("function")=0) + + .def_property("type", + [](cls &t) { return t.pcie.type; }, + [](cls &t, cl_uint val) { t.pcie.type = val; }) + + .def_property("bus", + [](cls &t) { return t.pcie.bus; }, + [](cls &t, cl_char val) { t.pcie.bus = val; }) + .def_property("device", + [](cls &t) { return t.pcie.device; }, + [](cls &t, cl_char val) { t.pcie.device = val; }) + .def_property("function", + [](cls &t) { return t.pcie.function; }, + [](cls &t, cl_char val) { t.pcie.function = val; }) + ; + } +#endif + + // }}} + } diff --git a/test/test_array.py b/test/test_array.py index 2cbef16c0d8bbd168ccb9a371f1904cb8cfc022a..d17772375a64f9d236568bef72005637e95d181a 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -426,12 +426,20 @@ def test_addition_scalar(ctx_factory): assert (7 + a == a_added).all() -def test_substract_array(ctx_factory): +@pytest.mark.parametrize(("dtype_a", "dtype_b"), + [ + (np.float32, np.float32), + (np.float32, np.int32), + (np.int32, np.int32), + (np.int64, np.int32), + (np.int64, np.uint32), + ]) +def test_subtract_array(ctx_factory, dtype_a, dtype_b): """Test the substraction of two arrays.""" #test data - a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(dtype_a) b = np.array([10, 20, 30, 40, 50, - 60, 70, 80, 90, 100]).astype(np.float32) + 60, 70, 80, 90, 100]).astype(dtype_b) context = ctx_factory() queue = cl.CommandQueue(context) @@ -471,14 +479,29 @@ def test_divide_scalar(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) - a_gpu = cl_array.to_device(queue, a) + dtypes = (np.uint8, np.uint16, np.uint32, + np.int8, np.int16, np.int32, + np.float32, np.complex64) + from pyopencl.characterize import has_double_support + if has_double_support(queue.device): + dtypes = dtypes + (np.float64, np.complex128) - result = (a_gpu / 2).get() - assert (a / 2 == result).all() + from itertools import product - result = (2 / a_gpu).get() - assert (np.abs(2 / a - result) < 1e-5).all() + for dtype_a, dtype_s in product(dtypes, repeat=2): + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a) + s = dtype_s(40) + a_gpu = cl_array.to_device(queue, a) + + b = a / s + b_gpu = a_gpu / s + assert (np.abs(b_gpu.get() - b) < 1e-3).all() + assert b_gpu.dtype is b.dtype + + c = s / a + c_gpu = s / a_gpu + assert (np.abs(c_gpu.get() - c) < 1e-3).all() + assert c_gpu.dtype is c.dtype def test_divide_array(ctx_factory): @@ -487,18 +510,100 @@ def test_divide_array(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - #test data - a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) - b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) + dtypes = (np.float32, np.complex64) + from pyopencl.characterize import has_double_support + if has_double_support(queue.device): + dtypes = dtypes + (np.float64, np.complex128) - a_gpu = cl_array.to_device(queue, a) - b_gpu = cl_array.to_device(queue, b) + from itertools import product + + for dtype_a, dtype_b in product(dtypes, repeat=2): + + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a) + b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b) + + a_gpu = cl_array.to_device(queue, a) + b_gpu = cl_array.to_device(queue, b) + c = a / b + c_gpu = (a_gpu / b_gpu) + assert (np.abs(c_gpu.get() - c) < 1e-3).all() + assert c_gpu.dtype is c.dtype + + d = b / a + d_gpu = (b_gpu / a_gpu) + assert (np.abs(d_gpu.get() - d) < 1e-3).all() + assert d_gpu.dtype is d.dtype + + +def test_divide_inplace_scalar(ctx_factory): + """Test inplace division of arrays and a scalar.""" - a_divide = (a_gpu / b_gpu).get() - assert (np.abs(a / b - a_divide) < 1e-3).all() + context = ctx_factory() + queue = cl.CommandQueue(context) + + dtypes = (np.uint8, np.uint16, np.uint32, + np.int8, np.int16, np.int32, + np.float32, np.complex64) + from pyopencl.characterize import has_double_support + if has_double_support(queue.device): + dtypes = dtypes + (np.float64, np.complex128) + + from itertools import product + + for dtype_a, dtype_s in product(dtypes, repeat=2): + + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a) + s = dtype_s(40) + a_gpu = cl_array.to_device(queue, a) + + # ensure the same behavior as inplace numpy.ndarray division + try: + a /= s + except TypeError: + with np.testing.assert_raises(TypeError): + a_gpu /= s + else: + a_gpu /= s + assert (np.abs(a_gpu.get() - a) < 1e-3).all() + assert a_gpu.dtype is a.dtype + + +def test_divide_inplace_array(ctx_factory): + """Test inplace division of arrays.""" + + context = ctx_factory() + queue = cl.CommandQueue(context) + + dtypes = (np.uint8, np.uint16, np.uint32, + np.int8, np.int16, np.int32, + np.float32, np.complex64) + from pyopencl.characterize import has_double_support + if has_double_support(queue.device): + dtypes = dtypes + (np.float64, np.complex128) + + from itertools import product - a_divide = (b_gpu / a_gpu).get() - assert (np.abs(b / a - a_divide) < 1e-3).all() + for dtype_a, dtype_b in product(dtypes, repeat=2): + print(dtype_a, dtype_b) + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a) + b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b) + + a_gpu = cl_array.to_device(queue, a) + b_gpu = cl_array.to_device(queue, b) + + # ensure the same behavior as inplace numpy.ndarray division + try: + a_gpu /= b_gpu + except TypeError: + # pass for now, as numpy casts differently for in-place and out-place + # true_divide + pass + # with np.testing.assert_raises(TypeError): + # a /= b + else: + a /= b + assert (np.abs(a_gpu.get() - a) < 1e-3).all() + assert a_gpu.dtype is a.dtype def test_bitwise(ctx_factory): diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 45e2d7476bf2939bcdee66c487c412667183e831..8aad416e75b9e3e346946cb2b5c5b40836c88ee4 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -1030,6 +1030,30 @@ def test_coarse_grain_svm(ctx_factory): cl.enqueue_copy(queue, new_ary, svm_ary) assert np.array_equal(orig_ary*2, new_ary) + # {{{ https://github.com/inducer/pyopencl/issues/372 + + buf_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_ONLY, 10, np.int32) + out_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_WRITE, 10, np.int32) + + svm_buf_arr = cl.SVM(buf_arr) + svm_out_arr = cl.SVM(out_arr) + with svm_buf_arr.map_rw(queue) as ary: + ary.fill(17) + + prg_ro = cl.Program(ctx, r""" + __kernel void twice_ro(__global int *out_g, __global int *in_g) + { + out_g[get_global_id(0)] = 2*in_g[get_global_id(0)]; + } + """).build() + + prg_ro.twice_ro(queue, buf_arr.shape, None, svm_out_arr, svm_buf_arr) + + with svm_out_arr.map_ro(queue) as ary: + print(ary) + + # }}} + def test_fine_grain_svm(ctx_factory): import sys