From 9d1d61160635a3c4eb99335704d4c04b88182e78 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Mon, 9 Oct 2023 11:46:27 -0500 Subject: [PATCH] Fix enqueue_marker for NVIDIA CUDA (#173) * Fix enqueue_marker for NVIDIA CUDA * update comment * check for same queue * Fix bad merge --- sumpy/tools.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sumpy/tools.py b/sumpy/tools.py index 6b92c125..13acbbab 100644 --- a/sumpy/tools.py +++ b/sumpy/tools.py @@ -973,7 +973,17 @@ def run_opencl_fft( import pyopencl as cl import pyopencl.array as cla - start_evt = cl.enqueue_marker(queue, wait_for=wait_for[:]) + if queue.device.platform.name == "NVIDIA CUDA": + # NVIDIA OpenCL gives wrong event profile values with wait_for + # Not passing wait_for will wait for all events queued before + # and therefore correctness is preserved if it's the same queue + for evt in wait_for: + if not evt.command_queue != queue: + raise RuntimeError( + "Different queues not supported with NVIDIA CUDA") + start_evt = cl.enqueue_marker(queue) + else: + start_evt = cl.enqueue_marker(queue, wait_for=wait_for[:]) if app.inplace: raise RuntimeError("inplace fft is not supported") @@ -991,7 +1001,11 @@ def run_opencl_fft( meth(app.app, int(input_vec.data.int_ptr), int(output_vec.data.int_ptr), int(queue.int_ptr)) - end_evt = cl.enqueue_marker(queue, wait_for=[start_evt]) + if queue.device.platform.name == "NVIDIA CUDA": + end_evt = cl.enqueue_marker(queue) + else: + end_evt = cl.enqueue_marker(queue, wait_for=[start_evt]) + output_vec.add_event(end_evt) return (MarkerBasedProfilingEvent(end_event=end_evt, start_event=start_evt), -- GitLab