diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 0a965564c4f95d52714eb1b6e8279d7d2dfe69c8..a72f66b9f5a339482544e513f33204429f2988a4 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -819,7 +819,7 @@ def _add_functionality():
                     self.function_name, self.num_args, self.num_args,
                     None,
                     warn_about_arg_count_bug=None,
-                    work_around_arg_count_bug=None)
+                    work_around_arg_count_bug=None, devs=self.context.devices)
 
             self._wg_info_cache = {}
             return self
@@ -858,7 +858,8 @@ def _add_functionality():
                 len(arg_types), self.num_args,
                 arg_types,
                 warn_about_arg_count_bug=warn_about_arg_count_bug,
-                work_around_arg_count_bug=work_around_arg_count_bug)
+                work_around_arg_count_bug=work_around_arg_count_bug,
+                devs=self.context.devices)
 
         # Make ourselves a kernel-specific class, so that we're able to override
         # __call__. Inspired by https://stackoverflow.com/a/38541437
@@ -880,7 +881,7 @@ def _add_functionality():
         return result
 
     def kernel_set_args(self, *args, **kwargs):
-        # Need to dupicate the 'self' argument for dynamically generated method
+        # Need to duplicate the 'self' argument for dynamically generated method
         return self._set_args(self, *args, **kwargs)
 
     def kernel_call(self, queue, global_size, local_size, *args, **kwargs):
diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 9383afdd59e3c9b0e587d9ea6594269c46e50a4a..bd1a402f71c7a97c2feb339be049b1bfde2ff796 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -29,6 +29,7 @@ import pyopencl._cl as _cl
 from pytools.persistent_dict import WriteOncePersistentDict
 from pytools.py_codegen import Indentation, PythonCodeGenerator
 from pyopencl.tools import _NumpyTypesKeyBuilder, VectorArg
+import pyopencl as cl
 
 
 # {{{ arg packing helpers
@@ -305,6 +306,88 @@ def _generate_enqueue_and_set_args_module(function_name,
             enqueue_name)
 
 
+# {{{ Helper functions related to argument sizes and device limits
+
+def _get_max_parameter_size(dev):
+    """Return the maximum kernel parameter size of *dev* in bytes.
+
+    This is ``dev.max_parameter_size``, except for pocl CPU and GPU devices
+    that report exactly 1024: pocl is known to report that value wrongly,
+    so a fixed per-device-type limit is returned instead.
+    """
+    from pyopencl.characterize import get_pocl_version
+
+    dev_limit = dev.max_parameter_size
+
+    # Current pocl versions (as of 04/2022) have an incorrect parameter
+    # size limit of 1024; see e.g. https://github.com/pocl/pocl/pull/1046
+    if get_pocl_version(dev.platform) is None or dev_limit != 1024:
+        return dev_limit
+
+    if dev.type & cl.device_type.CPU:
+        return 1024*1024
+    if dev.type & cl.device_type.GPU:
+        # All modern Nvidia GPUs (starting from Compute Capability 2)
+        # have this limit
+        return 4352
+
+    # Neither CPU nor GPU: fall back to the reported value.
+    return dev_limit
+
+
+def _check_arg_size(function_name, num_cl_args, arg_types, devs):
+    """Warn if the kernel arguments may exceed a device's parameter limit.
+
+    :arg function_name: kernel name, used only in the warning text.
+    :arg num_cl_args: number of kernel arguments; used in the warning text
+        and, when *arg_types* is false, to estimate the total size.
+    :arg arg_types: one entry per argument (*None*, a :class:`VectorArg`,
+        or anything :class:`numpy.dtype` accepts), or a false value if the
+        types are unknown.
+    :arg devs: iterable of devices to check.
+
+    Returns nothing; at most one warning is issued per device.
+    """
+    from warnings import warn
+
+    for dev in devs:
+        dev_ptr_size = dev.address_bits // 8
+        dev_limit = _get_max_parameter_size(dev)
+
+        if arg_types:
+            # An argument of unknown type (None) is counted as one pointer,
+            # so the total is then only an estimate.
+            is_estimate = any(arg_type is None for arg_type in arg_types)
+            total_arg_size = sum(
+                dev_ptr_size
+                if arg_type is None or isinstance(arg_type, VectorArg)
+                else np.dtype(arg_type).itemsize
+                for arg_type in arg_types)
+        else:
+            # Estimate that each argument has the size of a pointer on average
+            is_estimate = True
+            total_arg_size = dev_ptr_size * num_cl_args
+
+        if total_arg_size > dev_limit:
+            relation = "is higher than"
+        elif is_estimate and total_arg_size >= dev_limit * 0.75:
+            # Since total_arg_size is just an estimate, also warn in case we are
+            # just below the actual limit.
+            relation = "approaches"
+        else:
+            continue
+
+        # stacklevel=3 attributes the warning to the caller of
+        # generate_enqueue_and_set_args rather than to this module.
+        warn(f"Kernel '{function_name}' has {num_cl_args} arguments with "
+            f"a total size of {total_arg_size} bytes, which {relation} "
+            f"the limit of {dev_limit} bytes on {dev}. This might "
+            "lead to compilation errors, especially on GPU devices.",
+            stacklevel=3)
+
+# }}}
+
+
 invoker_cache = WriteOncePersistentDict(
         "pyopencl-invoker-cache-v41",
         key_builder=_NumpyTypesKeyBuilder())
@@ -313,7 +396,9 @@ invoker_cache = WriteOncePersistentDict(
 def generate_enqueue_and_set_args(function_name,
         num_passed_args, num_cl_args,
         arg_types,
-        work_around_arg_count_bug, warn_about_arg_count_bug):
+        work_around_arg_count_bug, warn_about_arg_count_bug, devs):
+
+    _check_arg_size(function_name, num_cl_args, arg_types, devs)
 
     cache_key = (function_name, num_passed_args, num_cl_args, arg_types,
             __debug__,