diff --git a/doc/array.rst b/doc/array.rst
index de1cc66ed3b1ecc09b4b999a95b04461215a9b55..34c65693e8c4244c5a6c401eb35d4573a56b0971 100644
--- a/doc/array.rst
+++ b/doc/array.rst
@@ -339,6 +339,15 @@ Constructing :class:`GPUArray` Instances
     Return the :class:`GPUArray` ``[a[indices[0]], ..., a[indices[n]]]``.
     For the moment, *a* must be a type that can be bound to a texture.
 
+.. function:: concatenate(arrays, axis=0, allocator=None)
+
+    Join a sequence of arrays along an existing axis.
+
+.. function:: stack(arrays, axis=0, allocator=None)
+
+    Join a sequence of arrays along a new axis.
+
+
 Conditionals
 ^^^^^^^^^^^^
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 373cf00532b51c7e870d2e0c09e092e418e74378..a0bf84c4d42f93ba2215c895f7e3746a2d9b177a 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -1767,6 +1767,108 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None):
 
 # {{{ shape manipulation
 
+def concatenate(arrays, axis=0, allocator=None):
+    """
+    Join a sequence of arrays along an existing axis.
+
+    :arg arrays: A sequence of :class:`GPUArray`.
+    :arg axis: Index of the existing axis along which the arrays are
+        joined. Can be negative, counting from the last dimension.
+    :arg allocator: Optional; defaults to the allocator of the first
+        array in *arrays*.
+    :returns: :class:`GPUArray`
+    """
+    # implementation is borrowed from pyopencl.array.concatenate()
+    if not arrays:
+        raise ValueError("need at least one array to concatenate")
+
+    if axis < 0:
+        axis += len(arrays[0].shape)
+
+    # {{{ find properties of result array
+
+    shape = None
+
+    def shape_except_axis(ary: GPUArray):
+        return ary.shape[:axis] + ary.shape[axis+1:]
+
+    for i_ary, ary in enumerate(arrays):
+        allocator = allocator or ary.allocator
+
+        if shape is None:
+            # first array
+            shape = list(ary.shape)
+        else:
+            if len(ary.shape) != len(shape):
+                raise ValueError("%d'th array has different number of axes "
+                        "(should have %d, has %d)"
+                        % (i_ary, len(shape), len(ary.shape)))
+
+            if (ary.ndim != arrays[0].ndim
+                    or shape_except_axis(ary) != shape_except_axis(arrays[0])):
+                raise ValueError("%d'th array has a shape which does not "
+                        "match the other arrays except along axis %d"
+                        % (i_ary, axis))
+
+        shape[axis] += ary.shape[axis]
+
+    # }}}
+
+    shape = tuple(shape)
+    dtype = np.result_type(*(ary.dtype for ary in arrays))
+    result = empty(shape, dtype, allocator=allocator)
+
+    full_slice = (slice(None),) * len(shape)
+
+    base_idx = 0
+    for ary in arrays:
+        my_len = ary.shape[axis]
+        result[full_slice[:axis]
+                + (slice(base_idx, base_idx+my_len),)
+                + full_slice[axis+1:]] = ary
+        base_idx += my_len
+
+    return result
+
+
+def stack(arrays, axis=0, allocator=None):
+    """
+    Join a sequence of arrays along a new axis.
+
+    :arg arrays: A sequence of :class:`GPUArray`.
+    :arg axis: Index of the new axis in the result array. Can be -1,
+        in which case the new axis becomes the last dimension.
+    :arg allocator: Optional; defaults to the allocator of the first
+        array in *arrays*.
+    :returns: :class:`GPUArray`
+    """
+    # implementation is borrowed from pyopencl.array.stack()
+    if not arrays:
+        raise ValueError("need at least one array to stack")
+
+    allocator = allocator or arrays[0].allocator
+
+    input_shape = arrays[0].shape
+    input_ndim = arrays[0].ndim
+    axis = input_ndim if axis == -1 else axis
+
+    if not all(ary.shape == input_shape for ary in arrays[1:]):
+        raise ValueError("arrays must have the same shape")
+
+    if not (0 <= axis <= input_ndim):
+        raise ValueError("invalid axis")
+
+    result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:]
+    result = empty(shape=result_shape,
+            dtype=np.result_type(*(ary.dtype for ary in arrays)),
+            allocator=allocator, order="C" if axis == 0 else "F")
+
+    for i, ary in enumerate(arrays):
+        idx = (slice(None),)*axis + (i,) + (slice(None),)*(input_ndim-axis)
+        result[idx] = ary
+
+    return result
+
 
 def transpose(a, axes=None):
     """Permute the dimensions of an array.
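Not part of the patch: a minimal usage sketch of the two new functions, assuming the changes above are applied and a CUDA context is initialized via ``pycuda.autoinit`` (the shapes here are illustrative only)::

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
    import pycuda.gpuarray as gpuarray

    a = gpuarray.to_gpu(np.ones((2, 3), dtype=np.float32))
    b = gpuarray.to_gpu(np.zeros((4, 3), dtype=np.float32))

    # concatenate joins along an existing axis: (2, 3) and (4, 3) -> (6, 3)
    c = gpuarray.concatenate((a, b), axis=0)
    assert c.shape == (6, 3)

    # stack adds a new axis: two (2, 3) inputs -> (2, 2, 3) with axis=0
    s = gpuarray.stack((a, a), axis=0)
    assert s.shape == (2, 2, 3)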
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 4b53aca7ba62d8508eff6cf8829342529339a158..dbf4b2f7f2ebee4fbec4a810b2ff3f8437d99458 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -468,6 +468,50 @@ class TestGPUArray:
         a = gpuarray.arange(12, dtype=np.float32)
         assert (np.arange(12, dtype=np.float32) == a.get()).all()
 
+    @mark_cuda_test
+    def test_stack(self):
+        from numpy.random import default_rng
+        rng = default_rng()
+
+        orders = ["F", "C"]
+        input_dims_lst = [0, 1, 2]
+
+        for order in orders:
+            for input_dims in input_dims_lst:
+                shape = (2, 2, 2)[:input_dims]
+                axis = -1 if order == "F" else 0
+
+                x_in = rng.random(size=shape)
+                y_in = rng.random(size=shape)
+                x_in = x_in if order == "C" else np.asfortranarray(x_in)
+                y_in = y_in if order == "C" else np.asfortranarray(y_in)
+
+                x_gpu = gpuarray.to_gpu(x_in)
+                y_gpu = gpuarray.to_gpu(y_in)
+
+                numpy_stack = np.stack((x_in, y_in), axis=axis)
+                gpuarray_stack = gpuarray.stack((x_gpu, y_gpu), axis=axis)
+
+                np.testing.assert_allclose(gpuarray_stack.get(), numpy_stack)
+                assert gpuarray_stack.shape == numpy_stack.shape
+
+    @mark_cuda_test
+    def test_concatenate(self):
+        from pycuda.curandom import rand as curand
+
+        a_dev = curand((5, 15, 20), dtype=np.float32)
+        b_dev = curand((4, 15, 20), dtype=np.float32)
+        c_dev = curand((3, 15, 20), dtype=np.float32)
+        a = a_dev.get()
+        b = b_dev.get()
+        c = c_dev.get()
+
+        cat_dev = gpuarray.concatenate((a_dev, b_dev, c_dev))
+        cat = np.concatenate((a, b, c))
+
+        np.testing.assert_allclose(cat, cat_dev.get())
+        assert cat.shape == cat_dev.shape
+
     @mark_cuda_test
     def test_reverse(self):
         a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
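A note on the ``order="C" if axis == 0 else "F"`` choice in ``stack`` above: it looks chosen so that each per-input assignment ``result[idx] = ary`` writes a contiguous block, since the leading index of a C-ordered array and the trailing index of an F-ordered array both select contiguous slices. A quick host-side check of that layout reasoning with NumPy (a sketch, not part of the patch)::

    import numpy as np

    a = np.ones((2, 3))
    b = np.zeros((2, 3))

    # New axis first: in a C-ordered result, result[i] is contiguous.
    first = np.stack((a, b), axis=0)                      # shape (2, 2, 3)
    assert first[0].flags.c_contiguous

    # New axis last: in an F-ordered result, result[..., i] is contiguous.
    last = np.asfortranarray(np.stack((a, b), axis=-1))   # shape (2, 3, 2)
    assert last[..., 0].flags.f_contiguous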