diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 36a39dc3d16e6fae90cf5ec5aabb269457a234d3..298771346360221958b7ad8826ccc472705f83fd 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -228,6 +228,18 @@ class GPUArray:
                     strides = _f_contiguous_strides(dtype.itemsize, shape)
                 elif order == "C":
                     strides = _c_contiguous_strides(dtype.itemsize, shape)
+                elif order == "A":
+                    # similar to https://numpy.org/doc/stable/reference/generated/numpy.array.html
+                    if base is not None and base.flags.f_contiguous:
+                        strides = _f_contiguous_strides(dtype.itemsize, shape)
+                    else:
+                        strides = _c_contiguous_strides(dtype.itemsize, shape)
+                elif order == "K":
+                    # refer to https://github.com/numpy/numpy/blob/9b84c1174125cb32a6be1bb6151782f8b2beda55/doc/neps/nep-0010-new-iterator-ufunc.rst
+                    if base is None or base.flags.c_contiguous:
+                        strides = _c_contiguous_strides(dtype.itemsize, shape)
+                    else:
+                        strides = _f_contiguous_strides(dtype.itemsize, shape)
                 else:
                     raise ValueError("invalid order: %s" % order)
             else:
@@ -895,7 +907,9 @@ class GPUArray:
             shape = tuple(shape[0])
 
         same_contiguity = (order == "C" and self.flags.c_contiguous) or (
-            order == "F" and self.flags.f_contiguous
+            order == "F" and self.flags.f_contiguous) or (
+            order == "A" and (self.flags.c_contiguous or self.flags.f_contiguous)) or (
+            order == "K" and (self.flags.c_contiguous or self.flags.f_contiguous)
         )
 
         if shape == self.shape and same_contiguity:
@@ -919,7 +933,7 @@ class GPUArray:
             dtype=self.dtype,
             allocator=self.allocator,
             base=self,
-            gpudata=int(self.gpudata),
+            gpudata=0 if self.gpudata is None else int(self.gpudata),
             order=order,
         )
 
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 73ec3ade3f99e8830882b6271aadd70945e4b4f3..4d956b752b8929e6f36edcbac81ff4de66d309c4 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -913,12 +913,28 @@ class TestGPUArray:
         assert throws_exception
 
         # with order specified
-        a_gpu = a_gpu.reshape((4, 32), order="C")
-        assert a_gpu.flags.c_contiguous
-        a_gpu = a_gpu.reshape(4, 32, order="F")
-        assert a_gpu.flags.f_contiguous
-        a_gpu = a_gpu.reshape((4, 32), order="F")
-        assert a_gpu.flags.f_contiguous
+        a_gpu_C = a_gpu.reshape((4, 32), order="C")
+        a_C = a.reshape((4, 32), order="C")
+        np.testing.assert_allclose(a_gpu_C.get(), a_C)
+        a_gpu_NC = a_gpu_C.T
+        a_gpu_new_F = a_gpu_NC.reshape(128)
+        a_NC = a_C.T
+        a_new_F = a_NC.reshape(128)
+        np.testing.assert_allclose(a_gpu_new_F.get(), a_new_F)
+        a_gpu_F = a_gpu.reshape((4, 32), order="F")
+        a_F = a.reshape((4, 32), order="F")
+        np.testing.assert_allclose(a_gpu_F.get(), a_F)
+        a_gpu_NF = a_gpu_F.T
+        a_gpu_new_C = a_gpu_NF.reshape(128)
+        a_NF = a_F.T
+        a_new_C = a_NF.reshape(128)
+        np.testing.assert_allclose(a_gpu_new_C.get(), a_new_C)
+        a_gpu_A = a_gpu.reshape((4, 32), order="A")
+        a_A = a.reshape((4, 32), order="A")
+        np.testing.assert_allclose(a_gpu_A.get(), a_A)
+        a_gpu_K = a_gpu.reshape((4, 32), order="K")
+        a_K = a.reshape((4, 32))  # np.reshape rejects order="K"; C-contig 1-D input yields C layout
+        np.testing.assert_allclose(a_gpu_K.get(), a_K)
         # default is C-contiguous
         a_gpu = a_gpu.reshape((4, 32))
         assert a_gpu.flags.c_contiguous