diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 896b2fcab6ce9ff89bd01155292a39f61d374bf0..eda944d86d2f73200be48646b1c3f498623d39a8 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -213,7 +213,6 @@ class GPUArray(object):
             assert base is None
         else:
             self.gpudata = gpudata
 
-        self.base = base
 
         self._grid, self._block = splay(self.mem_size)
@@ -375,7 +374,7 @@ class GPUArray(object):
 
         return out
 
-    def _new_like_me(self, dtype=None):
+    def _new_like_me(self, dtype=None, order="C"):
         strides = None
         if dtype is None:
             dtype = self.dtype
@@ -384,7 +383,7 @@ class GPUArray(object):
                 strides = self.strides
 
         return self.__class__(self.shape, dtype,
-                allocator=self.allocator, strides=strides)
+                allocator=self.allocator, strides=strides, order=order)
 
     # operators ---------------------------------------------------------------
     def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None):
@@ -680,9 +679,12 @@ class GPUArray(object):
 
         return result
 
-    def reshape(self, *shape):
+    def reshape(self, *shape, **kwargs):
         """Gives a new shape to an array without changing its data."""
 
+        # Python 2.x compatibility: use kwargs instead of named 'order' keyword
+        order = kwargs.pop("order", "C")
+
         # TODO: add more error-checking, perhaps
         if not self.flags.forc:
             raise RuntimeError("only contiguous arrays may "
@@ -691,7 +693,10 @@ class GPUArray(object):
         if isinstance(shape[0], tuple) or isinstance(shape[0], list):
             shape = tuple(shape[0])
 
-        if shape == self.shape:
+        same_contiguity = ((order == "C" and self.flags.c_contiguous) or
+                           (order == "F" and self.flags.f_contiguous))
+
+        if shape == self.shape and same_contiguity:
             return self
 
         if -1 in shape:
@@ -712,7 +717,8 @@ class GPUArray(object):
                 dtype=self.dtype,
                 allocator=self.allocator,
                 base=self,
-                gpudata=int(self.gpudata))
+                gpudata=int(self.gpudata),
+                order=order)
 
     def ravel(self):
         return self.reshape(self.size)
@@ -900,8 +906,11 @@ class GPUArray(object):
         if issubclass(dtype.type, np.complexfloating):
             from pytools import match_precision
             real_dtype = match_precision(np.dtype(np.float64), dtype)
-
-            result = self._new_like_me(dtype=real_dtype)
+            if self.flags.f_contiguous:
+                order = "F"
+            else:
+                order = "C"
+            result = self._new_like_me(dtype=real_dtype, order=order)
 
             func = elementwise.get_real_kernel(dtype, real_dtype)
             func.prepared_async_call(self._grid, self._block, None,
@@ -922,8 +931,11 @@ class GPUArray(object):
 
             from pytools import match_precision
             real_dtype = match_precision(np.dtype(np.float64), dtype)
-
-            result = self._new_like_me(dtype=real_dtype)
+            if self.flags.f_contiguous:
+                order = "F"
+            else:
+                order = "C"
+            result = self._new_like_me(dtype=real_dtype, order=order)
 
             func = elementwise.get_imag_kernel(dtype, real_dtype)
             func.prepared_async_call(self._grid, self._block, None,
@@ -941,7 +953,11 @@ class GPUArray(object):
             raise RuntimeError("only contiguous arrays may "
                     "be used as arguments to this operation")
 
-        result = self._new_like_me()
+        if self.flags.f_contiguous:
+            order = "F"
+        else:
+            order = "C"
+        result = self._new_like_me(order=order)
 
         func = elementwise.get_conj_kernel(dtype)
         func.prepared_async_call(self._grid, self._block, None,
@@ -989,27 +1005,55 @@ empty = GPUArray
 
 def zeros(shape, dtype, allocator=drv.mem_alloc, order="C"):
     """Returns an array of the given shape and dtype filled with 0's."""
-
     result = GPUArray(shape, dtype, allocator, order=order)
     zero = np.zeros((), dtype)
     result.fill(zero)
     return result
 
 
-def empty_like(other_ary):
+def empty_like(other_ary, dtype=None, order='K'):
+    if order == 'K':
+        if other_ary.flags.f_contiguous:
+            order = "F"
+        else:
+            order = "C"
+    if dtype is None:
+        dtype = other_ary.dtype
     result = GPUArray(
-            other_ary.shape, other_ary.dtype, other_ary.allocator)
+            other_ary.shape, dtype, other_ary.allocator, order=order)
     return result
 
 
-def zeros_like(other_ary):
+def zeros_like(other_ary, dtype=None, order='K'):
+    if order == 'K':
+        if other_ary.flags.f_contiguous:
+            order = "F"
+        else:
+            order = "C"
+    if dtype is None:
+        dtype = other_ary.dtype
     result = GPUArray(
-            other_ary.shape, other_ary.dtype, other_ary.allocator)
+            other_ary.shape, dtype, other_ary.allocator, order=order)
     zero = np.zeros((), result.dtype)
     result.fill(zero)
     return result
 
 
+def ones_like(other_ary, dtype=None, order='K'):
+    if order == 'K':
+        if other_ary.flags.f_contiguous:
+            order = "F"
+        else:
+            order = "C"
+    if dtype is None:
+        dtype = other_ary.dtype
+    result = GPUArray(
+            other_ary.shape, dtype, other_ary.allocator, order=order)
+    one = np.ones((), result.dtype)
+    result.fill(one)
+    return result
+
+
 def arange(*args, **kwargs):
     """Create an array filled with numbers spaced `step` apart,
     starting from `start` and ending at `stop`.
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index faae304806b0ec2cac971a88b1eeea7264e07036..39326eedb85a9e31eaa748519f59e1c5a3468630 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -691,6 +691,25 @@ class TestGPUArray:
             assert la.norm(z.get().imag - z.imag.get()) == 0
             assert la.norm(z.get().conj() - z.conj().get()) == 0
 
+            # verify contiguity is preserved
+            for order in ["C", "F"]:
+                # test both zero and non-zero value code paths
+                z_real = gpuarray.zeros(z.shape, dtype=real_dtype,
+                                        order=order)
+                z2 = z.reshape(z.shape, order=order)
+                for zdata in [z_real, z2]:
+                    if order == "C":
+                        assert zdata.flags.c_contiguous == True
+                        assert zdata.real.flags.c_contiguous == True
+                        assert zdata.imag.flags.c_contiguous == True
+                        assert zdata.conj().flags.c_contiguous == True
+                    elif order == "F":
+                        assert zdata.flags.f_contiguous == True
+                        assert zdata.real.flags.f_contiguous == True
+                        assert zdata.imag.flags.f_contiguous == True
+                        assert zdata.conj().flags.f_contiguous == True
+
+
     @mark_cuda_test
     def test_pass_slice_to_kernel(self):
         mod = SourceModule("""
@@ -779,6 +798,17 @@ class TestGPUArray:
             throws_exception = True
         assert throws_exception
 
+        # with order specified
+        a_gpu = a_gpu.reshape((4, 32), order='C')
+        assert a_gpu.flags.c_contiguous
+        a_gpu = a_gpu.reshape(4, 32, order='F')
+        assert a_gpu.flags.f_contiguous
+        a_gpu = a_gpu.reshape((4, 32), order='F')
+        assert a_gpu.flags.f_contiguous
+        # default is C-contiguous
+        a_gpu = a_gpu.reshape((4, 32))
+        assert a_gpu.flags.c_contiguous
+
     @mark_cuda_test
     def test_view(self):
         a = np.arange(128).reshape(8, 16).astype(np.float32)
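
Reviewer note (not part of the patch): below is a minimal usage sketch of the API surface this diff adds -- the order keyword on reshape, the dtype/order keywords on empty_like and zeros_like, the new ones_like, and contiguity-preserving real/imag/conj. It assumes a working CUDA device with the context created by pycuda.autoinit; shapes and values are illustrative only.

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
    import pycuda.gpuarray as gpuarray

    a = gpuarray.to_gpu(np.arange(8, dtype=np.complex64).reshape(2, 4))

    # reshape accepts order=..., and the result's contiguity flags follow it
    f = a.reshape((4, 2), order="F")
    assert f.flags.f_contiguous

    # zeros_like/empty_like accept dtype and order; the default order='K'
    # keeps the layout of the source array (F-contiguous here)
    z = gpuarray.zeros_like(f, dtype=np.complex128)
    assert z.flags.f_contiguous

    # ones_like is new and mirrors numpy.ones_like
    o = gpuarray.ones_like(a)
    assert o.flags.c_contiguous and (o.get() == 1).all()

    # real/imag/conj now preserve the Fortran ordering of their input
    assert f.real.flags.f_contiguous and f.conj().flags.f_contiguous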