diff --git a/sumpy/tools.py b/sumpy/tools.py index 6b92c125df84917458239e32913a8e5cb8c8ff12..13acbbab5180301bc9d63f5703020f3f7f0a4b68 100644 --- a/sumpy/tools.py +++ b/sumpy/tools.py @@ -973,7 +973,17 @@ def run_opencl_fft( import pyopencl as cl import pyopencl.array as cla - start_evt = cl.enqueue_marker(queue, wait_for=wait_for[:]) + if queue.device.platform.name == "NVIDIA CUDA": + # NVIDIA OpenCL gives wrong event profile values with wait_for + # Not passing wait_for will wait for all events queued before + # and therefore correctness is preserved if it's the same queue + for evt in wait_for: + if not evt.command_queue != queue: + raise RuntimeError( + "Different queues not supported with NVIDIA CUDA") + start_evt = cl.enqueue_marker(queue) + else: + start_evt = cl.enqueue_marker(queue, wait_for=wait_for[:]) if app.inplace: raise RuntimeError("inplace fft is not supported") @@ -991,7 +1001,11 @@ def run_opencl_fft( meth(app.app, int(input_vec.data.int_ptr), int(output_vec.data.int_ptr), int(queue.int_ptr)) - end_evt = cl.enqueue_marker(queue, wait_for=[start_evt]) + if queue.device.platform.name == "NVIDIA CUDA": + end_evt = cl.enqueue_marker(queue) + else: + end_evt = cl.enqueue_marker(queue, wait_for=[start_evt]) + output_vec.add_event(end_evt) return (MarkerBasedProfilingEvent(end_event=end_evt, start_event=start_evt),