diff --git a/sumpy/tools.py b/sumpy/tools.py
index 6b92c125df84917458239e32913a8e5cb8c8ff12..13acbbab5180301bc9d63f5703020f3f7f0a4b68 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -973,7 +973,17 @@ def run_opencl_fft(
         import pyopencl as cl
         import pyopencl.array as cla
 
-        start_evt = cl.enqueue_marker(queue, wait_for=wait_for[:])
+        if queue.device.platform.name == "NVIDIA CUDA":
+            # NVIDIA OpenCL gives wrong event profile values with wait_for.
+            # A marker without wait_for waits for everything queued before it,
+            # so this is only correct if all waited-on events are on this queue.
+            for evt in wait_for:
+                if evt.command_queue != queue:
+                    raise RuntimeError(
+                        "Different queues not supported with NVIDIA CUDA")
+            start_evt = cl.enqueue_marker(queue)
+        else:
+            start_evt = cl.enqueue_marker(queue, wait_for=wait_for[:])
 
         if app.inplace:
             raise RuntimeError("inplace fft is not supported")
@@ -991,7 +1001,11 @@ def run_opencl_fft(
         meth(app.app, int(input_vec.data.int_ptr),
             int(output_vec.data.int_ptr), int(queue.int_ptr))
 
-        end_evt = cl.enqueue_marker(queue, wait_for=[start_evt])
+        if queue.device.platform.name == "NVIDIA CUDA":
+            end_evt = cl.enqueue_marker(queue)
+        else:
+            end_evt = cl.enqueue_marker(queue, wait_for=[start_evt])
+
         output_vec.add_event(end_evt)
 
         return (MarkerBasedProfilingEvent(end_event=end_evt, start_event=start_evt),