diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 896b2fcab6ce9ff89bd01155292a39f61d374bf0..eda944d86d2f73200be48646b1c3f498623d39a8 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -213,7 +213,6 @@ class GPUArray(object):
             assert base is None
         else:
             self.gpudata = gpudata
 
-        self.base = base
 
         self._grid, self._block = splay(self.mem_size)
@@ -375,7 +374,7 @@ class GPUArray(object):
 
         return out
 
-    def _new_like_me(self, dtype=None):
+    def _new_like_me(self, dtype=None, order="C"):
         strides = None
         if dtype is None:
             dtype = self.dtype
@@ -384,7 +383,7 @@ class GPUArray(object):
                 strides = self.strides
 
         return self.__class__(self.shape, dtype,
-                allocator=self.allocator, strides=strides)
+                allocator=self.allocator, strides=strides, order=order)
 
     # operators ---------------------------------------------------------------
     def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None):
@@ -680,9 +679,12 @@ class GPUArray(object):
 
         return result
 
-    def reshape(self, *shape):
+    def reshape(self, *shape, **kwargs):
         """Gives a new shape to an array without changing its data."""
 
+        # Python 2.x compatibility: use kwargs instead of named 'order' keyword
+        order = kwargs.pop("order", "C")
+
         # TODO: add more error-checking, perhaps
         if not self.flags.forc:
             raise RuntimeError("only contiguous arrays may "
@@ -691,7 +693,10 @@ class GPUArray(object):
         if isinstance(shape[0], tuple) or isinstance(shape[0], list):
             shape = tuple(shape[0])
 
-        if shape == self.shape:
+        same_contiguity = ((order == "C" and self.flags.c_contiguous) or
+                           (order == "F" and self.flags.f_contiguous))
+
+        if shape == self.shape and same_contiguity:
             return self
 
         if -1 in shape:
@@ -712,7 +717,8 @@ class GPUArray(object):
                 dtype=self.dtype,
                 allocator=self.allocator,
                 base=self,
-                gpudata=int(self.gpudata))
+                gpudata=int(self.gpudata),
+                order=order)
 
     def ravel(self):
         return self.reshape(self.size)
@@ -900,8 +906,11 @@ class GPUArray(object):
         if issubclass(dtype.type, np.complexfloating):
             from pytools import match_precision
             real_dtype = match_precision(np.dtype(np.float64), dtype)
-
-            result = self._new_like_me(dtype=real_dtype)
+            if self.flags.f_contiguous:
+                order = "F"
+            else:
+                order = "C"
+            result = self._new_like_me(dtype=real_dtype, order=order)
 
             func = elementwise.get_real_kernel(dtype, real_dtype)
             func.prepared_async_call(self._grid, self._block, None,
@@ -922,8 +931,11 @@ class GPUArray(object):
 
             from pytools import match_precision
             real_dtype = match_precision(np.dtype(np.float64), dtype)
-
-            result = self._new_like_me(dtype=real_dtype)
+            if self.flags.f_contiguous:
+                order = "F"
+            else:
+                order = "C"
+            result = self._new_like_me(dtype=real_dtype, order=order)
 
             func = elementwise.get_imag_kernel(dtype, real_dtype)
             func.prepared_async_call(self._grid, self._block, None,
@@ -941,7 +953,11 @@ class GPUArray(object):
             raise RuntimeError("only contiguous arrays may "
                     "be used as arguments to this operation")
 
-        result = self._new_like_me()
+        if self.flags.f_contiguous:
+            order = "F"
+        else:
+            order = "C"
+        result = self._new_like_me(order=order)
 
         func = elementwise.get_conj_kernel(dtype)
         func.prepared_async_call(self._grid, self._block, None,
@@ -989,27 +1005,55 @@ empty = GPUArray
 
 def zeros(shape, dtype, allocator=drv.mem_alloc, order="C"):
     """Returns an array of the given shape and dtype filled with 0's."""
-
     result = GPUArray(shape, dtype, allocator, order=order)
     zero = np.zeros((), dtype)
     result.fill(zero)
     return result
 
 
-def empty_like(other_ary):
+def empty_like(other_ary, dtype=None, order='K'):
+    if order == 'K':
+        if other_ary.flags.f_contiguous:
+            order = "F"
+        else:
+            order = "C"
+    if dtype is None:
+        dtype = other_ary.dtype
     result = GPUArray(
-            other_ary.shape, other_ary.dtype, other_ary.allocator)
+            other_ary.shape, dtype, other_ary.allocator, order=order)
     return result
 
 
-def zeros_like(other_ary):
+def zeros_like(other_ary, dtype=None, order='K'):
+    if order == 'K':
+        if other_ary.flags.f_contiguous:
+            order = "F"
+        else:
+            order = "C"
+    if dtype is None:
+        dtype = other_ary.dtype
     result = GPUArray(
-            other_ary.shape, other_ary.dtype, other_ary.allocator)
+            other_ary.shape, dtype, other_ary.allocator, order=order)
     zero = np.zeros((), result.dtype)
     result.fill(zero)
     return result
 
 
+def ones_like(other_ary, dtype=None, order='K'):
+    if order == 'K':
+        if other_ary.flags.f_contiguous:
+            order = "F"
+        else:
+            order = "C"
+    if dtype is None:
+        dtype = other_ary.dtype
+    result = GPUArray(
+            other_ary.shape, dtype, other_ary.allocator, order=order)
+    one = np.ones((), result.dtype)
+    result.fill(one)
+    return result
+
+
 def arange(*args, **kwargs):
     """Create an array filled with numbers spaced `step` apart,
     starting from `start` and ending at `stop`.
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index faae304806b0ec2cac971a88b1eeea7264e07036..39326eedb85a9e31eaa748519f59e1c5a3468630 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -691,6 +691,25 @@ class TestGPUArray:
             assert la.norm(z.get().imag - z.imag.get()) == 0
             assert la.norm(z.get().conj() - z.conj().get()) == 0
 
+            # verify contiguity is preserved
+            for order in ["C", "F"]:
+                # test both zero and non-zero value code paths
+                z_real = gpuarray.zeros(z.shape, dtype=real_dtype,
+                                        order=order)
+                z2 = z.reshape(z.shape, order=order)
+                for zdata in [z_real, z2]:
+                    if order == "C":
+                        assert zdata.flags.c_contiguous == True
+                        assert zdata.real.flags.c_contiguous == True
+                        assert zdata.imag.flags.c_contiguous == True
+                        assert zdata.conj().flags.c_contiguous == True
+                    elif order == "F":
+                        assert zdata.flags.f_contiguous == True
+                        assert zdata.real.flags.f_contiguous == True
+                        assert zdata.imag.flags.f_contiguous == True
+                        assert zdata.conj().flags.f_contiguous == True
+
+
     @mark_cuda_test
     def test_pass_slice_to_kernel(self):
         mod = SourceModule("""
@@ -779,6 +798,17 @@ class TestGPUArray:
             throws_exception = True
         assert throws_exception
 
+        # with order specified
+        a_gpu = a_gpu.reshape((4, 32), order='C')
+        assert a_gpu.flags.c_contiguous
+        a_gpu = a_gpu.reshape(4, 32, order='F')
+        assert a_gpu.flags.f_contiguous
+        a_gpu = a_gpu.reshape((4, 32), order='F')
+        assert a_gpu.flags.f_contiguous
+        # default is C-contiguous
+        a_gpu = a_gpu.reshape((4, 32))
+        assert a_gpu.flags.c_contiguous
+
     @mark_cuda_test
     def test_view(self):
         a = np.arange(128).reshape(8, 16).astype(np.float32)
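
Reviewer note (not part of the patch): below is a minimal usage sketch of the API surface this diff adds -- the order keyword on reshape, the dtype/order keywords on empty_like and zeros_like, the new ones_like, and contiguity-preserving real/imag/conj. It assumes a working CUDA device with the context created by pycuda.autoinit; shapes and values are illustrative only.

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
    import pycuda.gpuarray as gpuarray

    a = gpuarray.to_gpu(np.arange(8, dtype=np.complex64).reshape(2, 4))

    # reshape accepts order=..., and the result's contiguity flags follow it
    f = a.reshape((4, 2), order="F")
    assert f.flags.f_contiguous

    # zeros_like/empty_like accept dtype and order; the default order='K'
    # keeps the layout of the source array (F-contiguous here)
    z = gpuarray.zeros_like(f, dtype=np.complex128)
    assert z.flags.f_contiguous

    # ones_like is new and mirrors numpy.ones_like
    o = gpuarray.ones_like(a)
    assert o.flags.c_contiguous and (o.get() == 1).all()

    # real/imag/conj now preserve the Fortran ordering of their input
    assert f.real.flags.f_contiguous and f.conj().flags.f_contiguous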