diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e37c40b503fa24ba47d8a4f1db3bbf64fe3747cb..0047755fcbc3ec26d86b3c24075354534c00ef13 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -166,6 +166,7 @@ Python 2.7 Apple:
 Python 3 Conda Apple:
   script:
   - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+  - export CC=gcc
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
   - ". ./build-and-test-py-project-within-miniconda.sh"
   tags:
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 2d03207962b5fb1c6f5c9c4edb48ad1d3478e231..704c495b4bff7ceed3d4808244a8bdb844f267b5 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -621,9 +621,10 @@ class Array(object):
                     stacklevel=2)
 
         if self.size:
-            cl.enqueue_copy(queue or self.queue, self.base_data, ary,
-                    device_offset=self.offset,
-                    is_blocking=not async_)
+            event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
+                    device_offset=self.offset, is_blocking=not async_,
+                    wait_for=self.events)  # run after pending writes to self
+            self.add_event(event1)
 
     def get(self, queue=None, ary=None, async_=None, **kwargs):
         """Transfer the contents of *self* into *ary* or a newly allocated
@@ -687,7 +688,7 @@ class Array(object):
         if self.size:
             cl.enqueue_copy(queue, ary, self.base_data,
                     device_offset=self.offset,
-                    is_blocking=not async_)
+                    wait_for=self.events, is_blocking=not async_)
 
         return ary
 
@@ -712,9 +713,11 @@ class Array(object):
             result = result.with_queue(queue)
 
         if self.nbytes:
-            cl.enqueue_copy(queue or self.queue,
+            event1 = cl.enqueue_copy(queue or self.queue,
                     result.base_data, self.base_data,
-                    src_offset=self.offset, byte_count=self.nbytes)
+                    src_offset=self.offset, byte_count=self.nbytes,
+                    wait_for=self.events)
+            result.add_event(event1)
 
         return result
 
@@ -1287,12 +1290,22 @@ class Array(object):
     def any(self, queue=None, wait_for=None):
         from pyopencl.reduction import get_any_kernel
         krnl = get_any_kernel(self.context, self.dtype)
-        return krnl(self, queue=queue, wait_for=wait_for)
+        # Copy wait_for: callers may pass None or any sequence of events.
+        wait_for = [] if wait_for is None else list(wait_for)
+        result, event1 = krnl(self, queue=queue,
+               wait_for=wait_for + self.events, return_event=True)
+        result.add_event(event1)
+        return result
 
     def all(self, queue=None, wait_for=None):
         from pyopencl.reduction import get_all_kernel
         krnl = get_all_kernel(self.context, self.dtype)
-        return krnl(self, queue=queue, wait_for=wait_for)
+        # Copy wait_for: callers may pass None or any sequence of events.
+        wait_for = [] if wait_for is None else list(wait_for)
+        result, event1 = krnl(self, queue=queue,
+               wait_for=wait_for + self.events, return_event=True)
+        result.add_event(event1)
+        return result
 
     @staticmethod
     @elwise_kernel_runner
@@ -1673,11 +1686,13 @@ class Array(object):
 
         if flags is None:
             flags = cl.map_flags.READ | cl.map_flags.WRITE
+        # Copy wait_for: callers may pass None or any sequence of events.
+        wait_for = [] if wait_for is None else list(wait_for)
 
         ary, evt = cl.enqueue_map_buffer(
                 queue or self.queue, self.base_data, flags, self.offset,
-                self.shape, self.dtype, strides=self.strides, wait_for=wait_for,
-                is_blocking=is_blocking)
+                self.shape, self.dtype, strides=self.strides,
+                wait_for=wait_for + self.events, is_blocking=is_blocking)
 
         if is_blocking:
             return ary
@@ -1796,6 +1811,9 @@ class Array(object):
         """
 
         queue = queue or self.queue or value.queue
+        # Callers may pass None or any sequence of events; copy before joining.
+        wait_for = [] if wait_for is None else list(wait_for)
+        wait_for = wait_for + self.events
 
         if isinstance(subscript, Array):
             if subscript.dtype.kind != "i":
@@ -2145,11 +2163,16 @@ def multi_take(arrays, indices, out=None, queue=None):
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
 
-        knl(queue, gs, ls,
+        wait_for_this = (indices.events
+            + _builtin_sum((i.events for i in arrays[chunk_slice]), [])
+            + _builtin_sum((o.events for o in out[chunk_slice]), []))
+        evt = knl(queue, gs, ls,
                 indices.data,
                 *([o.data for o in out[chunk_slice]]
                     + [i.data for i in arrays[chunk_slice]]
-                    + [indices.size]))
+                    + [indices.size]), wait_for=wait_for_this)
+        for o in out[chunk_slice]:
+            o.add_event(evt)
 
     return out
 
@@ -2219,7 +2242,10 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
                     queue.device))
 
         from pytools import flatten
-        knl(queue, gs, ls,
+        wait_for_this = (dest_indices.events + src_indices.events
+            + _builtin_sum((i.events for i in arrays[chunk_slice]), [])
+            + _builtin_sum((o.events for o in out[chunk_slice]), []))
+        evt = knl(queue, gs, ls,
                 *([o.data for o in out[chunk_slice]]
                     + [dest_indices.base_data,
                         dest_indices.offset,
@@ -2229,7 +2255,9 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
                         (i.base_data, i.offset)
                         for i in arrays[chunk_slice]))
                     + src_offsets_list[chunk_slice]
-                    + [src_indices.size]))
+                    + [src_indices.size]), wait_for=wait_for_this)
+        for o in out[chunk_slice]:
+            o.add_event(evt)
 
     return out
 
@@ -2244,6 +2272,9 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
     a_allocator = arrays[0].allocator
     context = dest_indices.context
     queue = queue or dest_indices.queue
+    # Callers may pass None or any sequence of events; copy before joining.
+    wait_for = [] if wait_for is None else list(wait_for)
+    wait_for = wait_for + dest_indices.events
 
     vec_count = len(arrays)
 
@@ -2295,6 +2326,9 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     queue.device))
 
         from pytools import flatten
+        wait_for_this = (wait_for
+            + _builtin_sum((i.events for i in arrays[chunk_slice]), [])
+            + _builtin_sum((o.events for o in out[chunk_slice]), []))
         evt = knl(queue, gs, ls,
                 *(
                     list(flatten(
@@ -2307,9 +2341,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     + [use_fill_cla.base_data, use_fill_cla.offset]
                     + [array_lengths_cla.base_data, array_lengths_cla.offset]
                     + [dest_indices.size]),
-                **dict(wait_for=wait_for))
-
-        # FIXME should wait on incoming events
+                **dict(wait_for=wait_for_this))
 
         for o in out[chunk_slice]:
             o.add_event(evt)
@@ -2387,7 +2419,8 @@ def diff(array, queue=None, allocator=None):
     allocator = allocator or array.allocator
 
     result = empty(queue, (n-1,), array.dtype, allocator=allocator)
-    _diff(result, array, queue=queue)
+    event1 = _diff(result, array, queue=queue)
+    result.add_event(event1)
     return result
 
 
@@ -2468,7 +2501,8 @@ def if_positive(criterion, then_, else_, out=None, queue=None):
 
     if out is None:
         out = empty_like(then_)
-    _if_positive(out, criterion, then_, else_, queue=queue)
+    event1 = _if_positive(out, criterion, then_, else_, queue=queue)
+    out.add_event(event1)
     return out
 
 
@@ -2501,7 +2535,10 @@ def sum(a, dtype=None, queue=None, slice=None):
     """
     from pyopencl.reduction import get_sum_kernel
     krnl = get_sum_kernel(a.context, dtype, a.dtype)
-    return krnl(a, queue=queue, slice=slice)
+    result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events,
+            return_event=True)
+    result.add_event(event1)
+    return result
 
 
 def dot(a, b, dtype=None, queue=None, slice=None):
@@ -2510,7 +2547,10 @@ def dot(a, b, dtype=None, queue=None, slice=None):
     """
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype)
-    return krnl(a, b, queue=queue, slice=slice)
+    result, event1 = krnl(a, b, queue=queue, slice=slice,
+            wait_for=a.events + b.events, return_event=True)
+    result.add_event(event1)
+    return result
 
 
 def vdot(a, b, dtype=None, queue=None, slice=None):
@@ -2521,7 +2561,10 @@ def vdot(a, b, dtype=None, queue=None, slice=None):
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype,
             conjugate_first=True)
-    return krnl(a, b, queue=queue, slice=slice)
+    result, event1 = krnl(a, b, queue=queue, slice=slice,
+            wait_for=a.events + b.events, return_event=True)
+    result.add_event(event1)
+    return result
 
 
 def subset_dot(subset, a, b, dtype=None, queue=None, slice=None):
@@ -2531,14 +2574,20 @@ def subset_dot(subset, a, b, dtype=None, queue=None, slice=None):
     from pyopencl.reduction import get_subset_dot_kernel
     krnl = get_subset_dot_kernel(
             a.context, dtype, subset.dtype, a.dtype, b.dtype)
-    return krnl(subset, a, b, queue=queue, slice=slice)
+    result, event1 = krnl(subset, a, b, queue=queue, slice=slice,
+            wait_for=subset.events + a.events + b.events, return_event=True)
+    result.add_event(event1)
+    return result
 
 
 def _make_minmax_kernel(what):
     def f(a, queue=None):
         from pyopencl.reduction import get_minmax_kernel
         krnl = get_minmax_kernel(a.context, what, a.dtype)
-        return krnl(a,  queue=queue)
+        result, event1 = krnl(a, queue=queue, wait_for=a.events,
+                return_event=True)
+        result.add_event(event1)
+        return result
 
     return f
 
@@ -2558,8 +2607,10 @@ def _make_subset_minmax_kernel(what):
     def f(subset, a, queue=None, slice=None):
         from pyopencl.reduction import get_subset_minmax_kernel
         krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype)
-        return krnl(subset, a,  queue=queue, slice=slice)
-
+        result, event1 = krnl(subset, a,  queue=queue, slice=slice,
+                wait_for=a.events + subset.events, return_event=True)
+        result.add_event(event1)
+        return result
     return f
 
 
@@ -2583,12 +2634,15 @@ def cumsum(a, output_dtype=None, queue=None,
 
     if output_dtype is None:
         output_dtype = a.dtype
+    # Copy wait_for: callers may pass None or any sequence of events.
+    wait_for = [] if wait_for is None else list(wait_for)
 
     result = a._new_like_me(output_dtype)
 
     from pyopencl.scan import get_cumsum_kernel
     krnl = get_cumsum_kernel(a.context, a.dtype, output_dtype)
-    evt = krnl(a, result, queue=queue, wait_for=wait_for)
+    evt = krnl(a, result, queue=queue, wait_for=wait_for + a.events)
+    result.add_event(evt)
 
     if return_event:
         return evt, result
diff --git a/pyopencl/clmath.py b/pyopencl/clmath.py
index 73d390cf850e9a054541209069c322e67adf402a..6d45fe44ceb02fa13c7c6fb4563045e3aecdd87c 100644
--- a/pyopencl/clmath.py
+++ b/pyopencl/clmath.py
@@ -41,7 +41,8 @@ def _make_unary_array_func(name):
 
     def f(array, queue=None):
         result = array._new_like_me(queue=queue)
-        knl_runner(result, array, queue=queue)
+        event1 = knl_runner(result, array, queue=queue)
+        result.add_event(event1)
         return result
 
     return f
@@ -78,7 +79,7 @@ def atan2(y, x, queue=None):
     """
     queue = queue or y.queue
     result = y._new_like_me(_get_common_dtype(y, x, queue))
-    _atan2(result, y, x, queue=queue)
+    result.add_event(_atan2(result, y, x, queue=queue))
     return result
 
 
@@ -92,7 +93,7 @@ def atan2pi(y, x, queue=None):
     """
     queue = queue or y.queue
     result = y._new_like_me(_get_common_dtype(y, x, queue))
-    _atan2pi(result, y, x, queue=queue)
+    result.add_event(_atan2pi(result, y, x, queue=queue))
     return result
 
 
@@ -130,7 +131,7 @@ def fmod(arg, mod, queue=None):
     for each element in `arg` and `mod`."""
     queue = (queue or arg.queue) or mod.queue
     result = arg._new_like_me(_get_common_dtype(arg, mod, queue))
-    _fmod(result, arg, mod, queue=queue)
+    result.add_event(_fmod(result, arg, mod, queue=queue))
     return result
 
 # TODO: fract
@@ -148,7 +149,9 @@ def frexp(arg, queue=None):
     """
     sig = arg._new_like_me(queue=queue)
     expt = arg._new_like_me(queue=queue, dtype=np.int32)
-    _frexp(sig, expt, arg, queue=queue)
+    event1 = _frexp(sig, expt, arg, queue=queue)
+    sig.add_event(event1)
+    expt.add_event(event1)
     return sig, expt
 
 # TODO: hypot
@@ -169,7 +172,7 @@ def ldexp(significand, exponent, queue=None):
     `result = significand * 2**exponent`.
     """
     result = significand._new_like_me(queue=queue)
-    _ldexp(result, significand, exponent)
+    result.add_event(_ldexp(result, significand, exponent, queue=queue))
     return result
 
 
@@ -199,7 +202,9 @@ def modf(arg, queue=None):
     """
     intpart = arg._new_like_me(queue=queue)
     fracpart = arg._new_like_me(queue=queue)
-    _modf(intpart, fracpart, arg, queue=queue)
+    event1 = _modf(intpart, fracpart, arg, queue=queue)
+    fracpart.add_event(event1)
+    intpart.add_event(event1)
     return fracpart, intpart
 
 
@@ -254,18 +259,20 @@ def _hankel_01(h0, h1, x):
 
 def bessel_jn(n, x, queue=None):
     result = x._new_like_me(queue=queue)
-    _bessel_jn(result, n, x, queue=queue)
+    result.add_event(_bessel_jn(result, n, x, queue=queue))
     return result
 
 
 def bessel_yn(n, x, queue=None):
     result = x._new_like_me(queue=queue)
-    _bessel_yn(result, n, x, queue=queue)
+    result.add_event(_bessel_yn(result, n, x, queue=queue))
     return result
 
 
 def hankel_01(x, queue=None):
     h0 = x._new_like_me(queue=queue)
     h1 = x._new_like_me(queue=queue)
-    _hankel_01(h0, h1, x, queue=queue)
+    event1 = _hankel_01(h0, h1, x, queue=queue)
+    h0.add_event(event1)
+    h1.add_event(event1)
     return h0, h1
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index b7b296ce959f2f9d92c290196e0a8f28b6043dc7..5264767c4094806fe43fdfea6056237aed20ade4 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -961,8 +961,14 @@ def test_bitonic_sort(ctx_factory, size, dtype):
     from pyopencl.bitonic_sort import BitonicSort
 
     s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
+    sgs = s.copy()
+    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
+    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
+    if (dev.platform.name == "Portable Computing Language"
+            and cl.get_cl_header_version() < (1, 2)):
+        sgs.finish()
     sorter = BitonicSort(ctx)
-    sgs, evt = sorter(s.copy(), axis=1)
+    sgs, evt = sorter(sgs, axis=1)
     assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
 
 
@@ -1014,7 +1020,14 @@ def test_bitonic_argsort(ctx_factory, size, dtype):
 
     sorterm = BitonicSort(ctx)
 
-    ms, evt = sorterm(m.copy(), idx=index, axis=0)
+    ms = m.copy()
+    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
+    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
+    if (dev.platform.name == "Portable Computing Language"
+            and cl.get_cl_header_version() < (1, 2)):
+        ms.finish()
+        index.finish()
+    ms, evt = sorterm(ms, idx=index, axis=0)
 
     assert np.array_equal(np.sort(m.get()), ms.get())
 
diff --git a/test/test_array.py b/test/test_array.py
index bca78f5ccbeebf1d86f9ec03a25ded6050bc4097..05008c169ae782a49b5b985c7a79780e337c5770 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1213,6 +1213,79 @@ def test_multi_put(ctx_factory):
     assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
 
 
+def test_outoforderqueue_get(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    b_gpu = a_gpu + a_gpu**5 + 1
+    b1 = b_gpu.get()  # testing that this waits for events
+    b = a + a**5 + 1
+    assert np.abs(b1 - b).mean() < 1e-5
+
+
+def test_outoforderqueue_copy(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    c_gpu = a_gpu**2 - 7
+    b_gpu = c_gpu.copy()  # testing that this waits for and creates events
+    b_gpu *= 10
+    queue.finish()
+    b1 = b_gpu.get()
+    b = 10 * (a**2 - 7)
+    assert np.abs(b1 - b).mean() < 1e-5
+
+
+def test_outoforderqueue_indexing(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype('int32'))
+    a_gpu = cl_array.to_device(queue, a)
+    i_gpu = cl_array.to_device(queue, i)
+    c_gpu = (a_gpu**2)[i_gpu - 10000]
+    b_gpu = 10 - a_gpu
+    b_gpu[:] = 8 * a_gpu
+    b_gpu[i_gpu + 10000] = c_gpu - 10
+    queue.finish()
+    b1 = b_gpu.get()
+    c = (a**2)[i - 10000]
+    b = 8 * a
+    b[i + 10000] = c - 10
+    assert np.abs(b1 - b).mean() < 1e-5
+
+
+def test_outoforderqueue_reductions(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    # 0/1 values to avoid accumulated rounding error
+    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
+    a[800000] = 10  # all<5 looks true until near the end
+    a_gpu = cl_array.to_device(queue, a)
+    b1 = cl_array.sum(a_gpu).get()
+    b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
+    b3 = (a_gpu < 5).all().get()
+    assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
+
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
     # tests.
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 553ed7a6f6ad4f93abde9e46b0bc5fb60a9066f0..beebc2a8c0ad717e7139c340bff14a07fb77b60c 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -447,6 +447,23 @@ def test_hankel_01_complex(ctx_factory, ref_src):
         pt.show()
 
 
+def test_outoforderqueue_clmath(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    # testing that clmath functions wait for and create events
+    b_gpu = clmath.fabs(clmath.sin(a_gpu * 5))
+    queue.finish()
+    b1 = b_gpu.get()
+    b = np.abs(np.sin(a * 5))
+    assert np.abs(b1 - b).mean() < 1e-5
+
+
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1: