diff --git a/pyopencl/array.py b/pyopencl/array.py
index c1b132cc644dfb882ae63f1712a39cebef0dca62..72d2a0cfb6a8a68b14c202e0c4f8a16219383012 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -726,8 +726,8 @@ class Array(object):
 
     def get_async(self, queue=None, ary=None, **kwargs):
         """
-        Asynchronous version of :meth:`get`, following the same calling convention
-        while returning a tuple ``(ary, event)`` containing the host array `ary`
+        Asynchronous version of :meth:`get`, which returns a tuple
+        ``(ary, event)`` containing the host array `ary`
         and the :class:`pyopencl.NannyEvent` `event` returned by
         :meth:`pyopencl.enqueue_copy`.
         """
diff --git a/test/test_array.py b/test/test_array.py
index 02e43e2481f4b3bdcdbc25f4c682dbd0043052b0..cf63fc1437a718d80200facd25d5d73e19d34077 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1217,6 +1217,32 @@ def test_multi_put(ctx_factory):
     assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
 
 
+def test_get_async(ctx_factory):
+    """Check that asynchronous device-to-host reads honor pending events."""
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    b = a + a**5 + 1
+    b_gpu = a_gpu + a_gpu**5 + 1
+
+    # deprecated, but still test
+    b1 = b_gpu.get(async_=True)  # testing that this waits for events
+    b_gpu.finish()
+    assert np.abs(b1 - b).mean() < 1e-5
+    # get_async returns (host_array, event): unpack, then wait on the transfer
+    b1, evt = b_gpu.get_async()  # testing that this waits for events
+    evt.wait()
+    assert np.abs(b1 - b).mean() < 1e-5
+
+    wait_event = cl.UserEvent(context)
+    b_gpu.add_event(wait_event)
+    b2, evt = b_gpu.get_async()  # testing that this doesn't hang
+    wait_event.set_status(cl.command_execution_status.COMPLETE)
+    evt.wait()
+    assert np.abs(b2 - b).mean() < 1e-5
+
+
 def test_outoforderqueue_get(ctx_factory):
     context = ctx_factory()
     try: