diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5ea9ce16b28d9de2fa705ce5412828b1b0dd3c11..0339acc2414705e6f81fe44b3c2b15f4869d49db 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -40,6 +40,20 @@ Python 3.5 Titan:
   except:
   - tags
 
+Python 3.7 Titan:
+  script:
+  - py_version=3.7
+  - EXTRA_INSTALL="cython git+https://github.com/numpy/numpy.git@d233e1f4c176de8b1bf1365aac48caa10610a402 mako"
+  - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python3.7
+  - nvidia-titan-x
+  except:
+  - tags
+
 Python 3.5 K40:
   script:
   - py_version=3.5
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 99b63969ad25bd78c762aa70c9ecb917a22360e4..5f96787cb3bb5cbbff058962f7dcd8d0107e99bc 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -227,7 +227,7 @@ class GPUArray(object):
     def flags(self):
         return _ArrayFlags(self)
 
-    def set(self, ary, async=False, stream=None):
+    def set(self, ary, asynchronous=False, stream=None):
         if ary.size != self.size:
             raise ValueError("ary and self must be the same size")
         if ary.shape != self.shape:
@@ -240,12 +240,12 @@ class GPUArray(object):
             raise ValueError("ary and self must have the same dtype")
 
         if self.size:
-            _memcpy_discontig(self, ary, async=async, stream=stream)
+            _memcpy_discontig(self, ary, asynchronous=asynchronous, stream=stream)
 
     def set_async(self, ary, stream=None):
-        return self.set(ary, async=True, stream=stream)
+        return self.set(ary, asynchronous=True, stream=stream)
 
-    def get(self, ary=None, pagelocked=False, async=False, stream=None):
+    def get(self, ary=None, pagelocked=False, asynchronous=False, stream=None):
         if ary is None:
             if pagelocked:
                 ary = drv.pagelocked_empty(self.shape, self.dtype)
@@ -268,11 +268,11 @@ class GPUArray(object):
             raise TypeError("self and ary must have the same dtype")
 
         if self.size:
-            _memcpy_discontig(ary, self, async=async, stream=stream)
+            _memcpy_discontig(ary, self, asynchronous=asynchronous, stream=stream)
         return ary
 
     def get_async(self, stream=None, ary=None):
-        return self.get(ary=ary, async=True, stream=stream)
+        return self.get(ary=ary, asynchronous=True, stream=stream)
 
     def copy(self):
         new = GPUArray(self.shape, self.dtype, self.allocator)
@@ -1195,7 +1195,7 @@ def _compact_strides(a):
     return strides
 
 
-def _memcpy_discontig(dst, src, async=False, stream=None):
+def _memcpy_discontig(dst, src, asynchronous=False, stream=None):
     """Copy the contents of src into dst.
 
     The two arrays should have the same dtype, shape, and order, but
@@ -1256,7 +1256,7 @@ def _memcpy_discontig(dst, src, async=False, stream=None):
     if len(shape) <= 1:
         if isinstance(src, GPUArray):
             if isinstance(dst, GPUArray):
-                if async:
+                if asynchronous:
                     drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream)
                 else:
                     drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
@@ -1266,13 +1266,13 @@ def _memcpy_discontig(dst, src, async=False, stream=None):
                 # so that the order is neither Fortran or C.
                 # So, we attempt to get a contiguous view of dst.
                 dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
-                if async:
+                if asynchronous:
                     drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                 else:
                     drv.memcpy_dtoh(dst, src.gpudata)
         else:
             src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
-            if async:
+            if asynchronous:
                 drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
             else:
                 drv.memcpy_htod(dst.gpudata, src)
@@ -1302,7 +1302,7 @@ def _memcpy_discontig(dst, src, async=False, stream=None):
     copy.height = shape[1]
 
     if len(shape) == 2:
-        if async:
+        if asynchronous:
             copy(stream)
         else:
             copy(aligned=True)
@@ -1317,7 +1317,7 @@ def _memcpy_discontig(dst, src, async=False, stream=None):
     copy.dst_height = dst_strides[2] // dst_strides[1]
     copy.depth = shape[2]
 
-    if async:
+    if asynchronous:
         copy(stream)
     else:
         copy()
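
A minimal usage sketch follows (not part of the patch). It assumes a pycuda build with this change applied and uses only entry points that already exist in pycuda (pycuda.autoinit, pycuda.driver.Stream, pycuda.gpuarray.to_gpu, pagelocked_empty); it shows what the renamed asynchronous= keyword looks like to callers on Python 3.7, where async is a reserved word:

# Usage sketch (not part of the patch): exercises the renamed keyword argument.
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a default CUDA context
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

stream = drv.Stream()

a = np.arange(16, dtype=np.float32)
a_gpu = gpuarray.to_gpu(a)

# Synchronous copy back to the host works as before.
a_host = a_gpu.get()

# Asynchronous copy: `asynchronous=True` replaces the old `async=True`,
# which is a SyntaxError on Python 3.7. Use a page-locked destination so
# the transfer can actually overlap with other work on the stream.
pinned = drv.pagelocked_empty(a_gpu.shape, a_gpu.dtype)
a_gpu.get(ary=pinned, asynchronous=True, stream=stream)

# get_async()/set_async() keep their public names; per this patch they now
# forward to the asynchronous= keyword internally.
a_gpu.set_async(pinned, stream=stream)

stream.synchronize()

The spelling async=True would fail to parse on 3.7, which is what the rename and the new "Python 3.7 Titan" CI job guard against.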