diff --git a/arraycontext/impl/pytato/__init__.py b/arraycontext/impl/pytato/__init__.py
index 98a310d086b45bbc87db4cb4539a0c23149c291e..6b9ac6b7ee2f0a8a83506ddbd02c0030552494cd 100644
--- a/arraycontext/impl/pytato/__init__.py
+++ b/arraycontext/impl/pytato/__init__.py
@@ -387,7 +387,30 @@ class PytatoPyOpenCLArrayContext(_BasePytatoArrayContext):
                     self.using_svm and dev.type & cl.device_type.GPU
                     and cl_char.has_coarse_grain_buffer_svm(dev))):
 
-            limit = dev.max_parameter_size
+            if dev.max_parameter_size == 4352:
+                # Nvidia devices and PTXAS declare a limit of 4352 bytes,
+                # which is incorrect. The CUDA documentation at
+                # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters
+                # mentions a limit of 4KB, which is also incorrect.
+                # As far as I can tell, the actual limit is around 4080
+                # bytes, at least on a K40. Reducing the limit further
+                # in order to be on the safe side.
+
+                # Note that the naming convention isn't super consistent
+                # for Nvidia GPUs, so we rely solely on the maximum
+                # parameter size to determine if it is an Nvidia GPU.
+
+                limit = 4096 - 200
+
+                # A forced limit (applied just below) replaces this value,
+                # so only warn when the reduced limit actually takes effect.
+                if self._force_svm_arg_limit is None:
+                    from warnings import warn
+                    warn("Running on an Nvidia GPU, reducing the argument "
+                         f"size limit from 4352 to {limit}.")
+            else:
+                limit = dev.max_parameter_size
+
             if self._force_svm_arg_limit is not None:
                 limit = self._force_svm_arg_limit
 