diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 6df39063e9919a4518f4a3f41e149265ec21db25..8228860ec058e3b1252dcb184577f28a8f88b434 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -389,7 +389,15 @@ class GPUArray:
         return self.get(ary=ary, async_=True, stream=stream)
 
     def copy(self):
-        new = GPUArray(self.shape, self.dtype, self.allocator)
+        """Return a new :class:`GPUArray` holding a copy of this array's data.
+
+        The new array is constructed from this array's shape, dtype,
+        allocator and strides, then filled via ``_memcpy_discontig``.
+        """
+        # NOTE(review): strides are forwarded verbatim. If *self* is a
+        # non-contiguous view, confirm that the constructor allocates enough
+        # memory for that stride pattern -- TODO verify.
+        new = GPUArray(self.shape, self.dtype, self.allocator, strides=self.strides)
         _memcpy_discontig(new, self)
         return new
 
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index fd075dccf13bec7aabe967f9f361cc7f35dbc5cb..ea641d093970c520b9cbf529328758a2ff4677f4 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -1546,6 +1546,21 @@ class TestGPUArray:
         d2 = d[:, 7:9, :]  # non C-contiguous
         d2.transpose(axes=(1, 0, 2))  # crashes for recent versions
 
+    def test_copy_strides(self):
+        """Check that ``GPUArray.copy()`` keeps the strides of its source.
+
+        Regression test for https://github.com/inducer/pycuda/issues/403
+        """
+        host_f = np.random.randn(22, 33).copy(order="f")
+        expected_strides = host_f.strides
+
+        dev_orig = gpuarray.to_gpu(host_f)
+        assert dev_orig.strides == expected_strides
+
+        dev_dup = dev_orig.copy()
+        assert dev_dup.strides == expected_strides
+        assert np.array_equal(dev_dup.get(), host_f)
+
 
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the tests.