diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 56cf3886e3b56be7b10cbd90ebecbb4917437724..ea72e3a1b8926360f9f0ec1f683cc937b5b428b0 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -15,6 +15,21 @@ Python 3 POCL:
       junit: test/pytest.xml
 
 
+Python 3 POCL Examples:
+  script:
+  - export PY_EXE=python3
+  - export PYOPENCL_TEST=portable:pthread
+  - export EXTRA_INSTALL="pyopencl"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh
+  - ". ./build-py-project-and-run-examples.sh"
+  tags:
+  - python3
+  - pocl
+  - large-node
+  except:
+  - tags
+
+
 Pylint:
   script:
   - export PY_EXE=python3
diff --git a/doc/design.rst b/doc/design.rst
index fd50da9160a4bc4b8cb7531a38a31d0dc1d7277d..5114de67c23f18365d7bee7aba5deb54b94b1402 100644
--- a/doc/design.rst
+++ b/doc/design.rst
@@ -121,6 +121,10 @@ Reserved Identifiers
 
     -   ``_pt_out``: The default name of an unnamed output argument
 
+    -   ``_pt_data``: Used to automatically generate identifiers for
+        names of :class:`~pytato.array.DataWrapper` arguments that are
+        not supplied by the user.
+
 -   Identifiers used in index lambdas are also reserved. These include:
 
     -   Identifiers matching the regular expression ``_[0-9]+``. They are used
diff --git a/examples/advection.py b/examples/advection.py
new file mode 100755
index 0000000000000000000000000000000000000000..a599b057a27669073143268156ac1c2fafbe7c48
--- /dev/null
+++ b/examples/advection.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+import numpy as np
+import pytest
+import functools
+from dg_tools import DGDiscr1D, integrate, DGOps1D, DGOps1DRef
+
+memoized = functools.lru_cache(maxsize=None)
+
+
+class AdvectionOperator(object):
+    """A class representing a DG advection operator."""
+
+    def __init__(self, discr, c, flux_type, dg_ops):
+        """
+        Inputs:
+            discr: an instance of DGDiscr1D
+            c: advection speed parameter
+            flux_type: "upwind" or "central"
+            dg_ops: An instance of AbstractDGOps1D
+        """
+        self.discr = discr
+        self.c = c
+        assert flux_type in ("upwind", "central")
+        self.flux_type = flux_type
+        self.dg = dg_ops
+
+    def weak_flux(self, vec):
+        """Apply the flux, weak form.
+
+        Inputs:
+            dg: a DGOps1D instance
+            vec: vector of nodal values at the faces
+
+        Signature: (m, 2) -> (m, 2)
+        """
+        if self.flux_type == "central":
+            flux = (vec + self.dg.face_swap(vec)) / 2
+
+        elif self.flux_type == "upwind":
+            swp = self.dg.face_swap(vec)
+            if self.c >= 0:
+                flux = self.dg.array_ops.stack((vec[:, 0], swp[:, 1]), axis=1)
+            else:
+                flux = self.dg.array_ops.stack((swp[:, 0], vec[:, 1]), axis=1)
+
+        flux = flux * self.c * self.dg.normals
+
+        return flux
+
+    def strong_flux(self, vec):
+        """Apply the flux, strong form.
+
+        Inputs:
+            dg: a DGOps1D instance
+            vec: vector of nodal values at the faces
+
+        Signature: (m, 2) -> (m, 2)
+        """
+        return self.c * self.dg.normals * vec - self.weak_flux(vec)
+
+    def apply(self, vec):
+        """Main operator implementation.
+
+        Signature: (m, n) -> (m, n)
+        """
+        dg = self.dg
+        return -dg.inv_mass(
+                dg.face_mass(self.strong_flux(dg.interp(vec)))
+                - self.c * dg.stiffness(vec))
+
+    def __call__(self, vec):
+        """Apply the DG advection operator to the vector of degrees of freedom.
+
+        Signature: (m*n,) -> (m*n,)
+        """
+        vec = vec.reshape((self.discr.nelements, self.discr.nnodes))
+        return self.apply(vec).reshape((-1,))
+
+
+def rk4(rhs, initial, t_initial, t_final, dt):
+    """RK4 integrator.
+
+    Inputs:
+        - rhs: a callable that takes arguments (t, y)
+        - initial: initial value
+        - t_initial: initial time
+        - t_final: final time
+        - dt: step size
+
+    Returns:
+        The solution computed at the final time.
+    """
+    t = t_initial
+    sol = initial
+
+    while t < t_final:
+        dt = min(dt, t_final - t)
+        s0 = rhs(t, sol)
+        s1 = rhs(t + dt/2, sol + dt/2 * s0)
+        s2 = rhs(t + dt/2, sol + dt/2 * s1)
+        s3 = rhs(t + dt, sol + dt * s2)
+        sol = sol + dt / 6 * (s0 + 2 * s1 + 2 * s2 + s3)
+        t += dt
+
+    return sol
+
+
+def test_rk4():
+    assert np.isclose(rk4(lambda t, y: -y, 1, 0, 5, 0.01), np.exp(-5))
+
+
+@pytest.mark.parametrize("order", (3, 4, 5))
+@pytest.mark.parametrize("flux_type", ("central", "upwind"))
+def test_ref_advection_convergence(order, flux_type):
+    errors = []
+    hs = []
+
+    for nelements in (8, 12, 16, 20):
+        discr = DGDiscr1D(0, 2*np.pi, nelements=nelements, nnodes=order)
+        u_initial = np.sin(discr.nodes())
+        op = AdvectionOperator(
+                discr, c=1, flux_type=flux_type, dg_ops=DGOps1DRef(discr))
+        u = rk4(lambda t, y: op(y), u_initial, t_initial=0, t_final=np.pi,
+                dt=0.01)
+        u_ref = -u_initial
+        hs.append(discr.h)
+        errors.append(integrate(discr, (u - u_ref)**2)**0.5)
+
+    eoc, _ = np.polyfit(np.log(hs), np.log(errors), 1)
+    assert eoc >= order - 0.1, eoc
+
+
+@pytest.mark.parametrize("order", (3, 4, 5))
+@pytest.mark.parametrize("flux_type", ("central", "upwind"))
+def test_advection_convergence(order, flux_type):
+    errors = []
+    hs = []
+
+    import pytato as pt
+    import pyopencl as cl
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    for nelements in (8, 12, 16, 20):
+        discr = DGDiscr1D(0, 2*np.pi, nelements=nelements, nnodes=order)
+        u_initial = np.sin(discr.nodes())
+
+        ns = pt.Namespace()
+        pt.make_size_param(ns, "nelements")
+        pt.make_size_param(ns, "nnodes")
+        u = pt.make_placeholder(ns,
+                "u", shape="(nelements, nnodes)", dtype=np.float64)
+        op = AdvectionOperator(discr, c=1, flux_type=flux_type,
+                dg_ops=DGOps1D(discr, ns))
+        result = op.apply(u)
+
+        prog = pt.generate_loopy(result, target=pt.PyOpenCLTarget(queue))
+
+        u = rk4(lambda t, y: prog(
+                u=y.reshape(nelements, order))[1][0].reshape(-1),
+                u_initial, t_initial=0, t_final=np.pi, dt=0.01)
+        u_ref = -u_initial
+        hs.append(discr.h)
+        errors.append(integrate(discr, (u - u_ref)**2)**0.5)
+
+    eoc, _ = np.polyfit(np.log(hs), np.log(errors), 1)
+    assert eoc >= order - 0.1, eoc
+
+
+def main():
+    import pytato as pt
+    import pyopencl as cl
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    nelements = 20
+    nnodes = 3
+
+    discr = DGDiscr1D(0, 2*np.pi, nelements=nelements, nnodes=nnodes)
+
+    ns = pt.Namespace()
+    pt.make_size_param(ns, "nelements")
+    pt.make_size_param(ns, "nnodes")
+    u = pt.make_placeholder(ns,
+            "u", shape="(nelements, nnodes)", dtype=np.float64)
+    op = AdvectionOperator(discr, c=1, flux_type="central",
+            dg_ops=DGOps1D(discr, ns))
+    result = op.apply(u)
+
+    prog = pt.generate_loopy(result, target=pt.PyOpenCLTarget(queue))
+    print(prog.program)
+
+    u = np.sin(discr.nodes())
+    print(prog(u=u.reshape(nelements, nnodes))[1][0])
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
+    main()
diff --git a/examples/dg_tools.py b/examples/dg_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..47ee6fb55b6f90e83bb4a47a9dd3fa73bad2a35b
--- /dev/null
+++ b/examples/dg_tools.py
@@ -0,0 +1,418 @@
+import numpy as np
+import numpy.polynomial.legendre as leg
+import numpy.linalg as la
+import contextlib
+
+__doc__ = """
+Notation convention for operator shapes
+=======================================
+
+* m - number of elements in the discretization
+* n - number of volume degrees of freedom per element
+"""
+
+import functools
+
+import pytato as pt
+
+
+memoized = functools.lru_cache(maxsize=None)
+
+
+def ortholegvander(x, deg):
+    """See numpy.polynomial.legendre.legvander(). Uses an orthonormal basis."""
+    result = leg.legvander(x, deg)
+    factors = np.array([np.sqrt((2*n+1)/2) for n in range(0, 1 + deg)])
+    return result * factors
+
+
+def ortholegder(c):
+    """See numpy.polynomial.legendre.legder(). Uses an orthonormal basis."""
+    fw_factors = np.array([np.sqrt((2*n+1)/2) for n in range(len(c))])
+    derivs = leg.legder(c * fw_factors)
+    return derivs / fw_factors[:len(derivs)]
+
+
+def ortholegval(x, c):
+    """See numpy.polynomial.legendre.legval(). Uses an orthonormal basis."""
+    factors = np.array([np.sqrt((2*n+1)/2) for n in range(len(c))])
+    return leg.legval(x, c * factors)
+
+
+class DGDiscr1D(object):
+    """A one-dimensional Discontinuous Galerkin discretization."""
+
+    def __init__(self, left, right, nelements, nnodes):
+        """
+        Inputs:
+            left - left endpoint
+            right - right endpoint
+            nelements - number of discretization panels
+            nnodes - number of degrees of freedom per panel
+        """
+        self.left = left
+        self.right = right
+        self.nelements = nelements
+        self.nnodes = nnodes
+
+    @property
+    @memoized
+    def ref_nodes(self):
+        """Return reference nodes for a single element.
+
+        Signature: ->(n,)
+        """
+        nodes, _ = leg.leggauss(self.nnodes)
+        return nodes
+
+    @property
+    @memoized
+    def ref_weights(self):
+        """Return reference quadrature weights for a single element.
+
+        Signature: ->(n,)
+        """
+        _, weights = leg.leggauss(self.nnodes)
+        return weights
+
+    def zeros(self):
+        """Return a zero solution.
+
+        Signature: ->(n*m,)
+        """
+        return np.zeros(self.nnodes * self.nelements)
+
+    @property
+    def h(self):
+        """Return the element size.
+
+        Signature: ->()
+        """
+        return self.elements[0,1] - self.elements[0,0]
+
+    def nodes(self):
+        """Return the vector of node coordinates.
+
+        Signature: ->(n*m,)
+        """
+        centers = (self.elements[:,0] + self.elements[:,1]) / 2
+        radii = (self.elements[:,1] - self.elements[:,0]) / 2
+        return ((self.ref_nodes[:,np.newaxis] * radii) + centers).T.ravel()
+
+    @property
+    @memoized
+    def vdm(self):
+        """Return the elementwise Vandermonde (modal-to-nodal) matrix.
+
+        Signature: ->(n, n)
+        """
+        return ortholegvander(self.ref_nodes, self.nnodes - 1)
+
+    @property
+    @memoized
+    def _ref_mass(self):
+        """Return the (volume) mass matrix for the reference element.
+
+        Signature: ->(n, n)
+        """
+        return la.inv(self.vdm @ self.vdm.T)
+
+    @property
+    @memoized
+    def mass(self):
+        """Return the elementwise volume mass matrix.
+
+        Signature: ->(n, n)
+        """
+        h = (self.right - self.left) / self.nelements
+        return (h/2) * self._ref_mass
+
+    @property
+    @memoized
+    def inv_mass(self):
+        """Return the inverse of the elementwise volume mass matrix.
+
+        Signature: ->(n, n)
+        """
+        return la.inv(self.mass)
+
+    @property
+    @memoized
+    def face_mass(self):
+        """Return the face mass matrix.
+
+        The face mass matrix combines the effects of applying the face mass
+        operator on each face and interpolating the output to the volume nodes.
+
+        Signature: ->(n, 2)
+        """
+        return self.interp.T.copy()
+
+    @property
+    @memoized
+    def diff(self):
+        """Return the elementwise differentiation matrix.
+
+        Signature: ->(n, n)
+        """
+        VrT = []
+        for row in np.eye(self.nnodes):
+            deriv = ortholegder(row)
+            VrT.append(ortholegval(self.ref_nodes, deriv))
+        Vr = np.vstack(VrT).T
+        return Vr @ la.inv(self.vdm)
+
+    @property
+    @memoized
+    def stiffness(self):
+        """Return the stiffness matrix.
+
+        Signature: ->(n, n)
+        """
+        return (self._ref_mass @ self.diff)
+
+    @property
+    @memoized
+    def interp(self):
+        """Return the volume-to-face interpolation matrix.
+
+        Signature: ->(2, n)
+        """
+        return ortholegvander([-1, 1], self.nnodes - 1) @ la.inv(self.vdm)
+
+    @property
+    @memoized
+    def elements(self):
+        """Return the list of elements, each given by their left/right boundaries.
+
+        Signature: ->(m, 2)
+        """
+        h = (self.right - self.left) / self.nelements
+        return np.array(list(zip(
+            np.linspace(self.left, self.right, self.nelements, endpoint=False),
+            np.linspace(h + self.left, self.right, self.nelements))))
+
+    @property
+    def dg_ops(self):
+        """Return a context manager yielding a DGOps1D instance.
+        """
+        return contextlib.contextmanager(lambda: (yield DGOps1DRef(self)))
+
+    @property
+    def normals(self):
+        """Return the face unit normals.
+
+        Signature: ->(m, 2)
+        """
+        result = np.zeros((self.nelements, 2))
+        result[:,0] = -1
+        result[:,1] = 1
+        return result
+
+
+def interpolate(discr, vec, nodes):
+    """Return an interpolated solution at *nodes*.
+
+    Input:
+        discr - a DGDiscr1D instance
+        vec - vector of nodal values at degrees of freedom
+        nodes - vector of nodes to interpolate to
+
+    Signature:  (m*n,) -> (len(nodes),)
+    """
+    elements = discr.elements
+    nelements = discr.nelements
+    nnodes = discr.nnodes
+    inv_vdm = la.inv(discr.vdm)
+
+    sorter = np.argsort(nodes)
+    sorted_nodes = nodes[sorter]
+    result = []
+
+    indices = np.searchsorted(sorted_nodes, elements)
+    for i, (start, end) in enumerate(indices):
+        if i == 0:
+            start = 0
+        elif i == nelements - 1:
+            end = len(nodes)
+
+        center = (elements[i][0] + elements[i][1]) / 2
+        radius = (elements[i][1] - elements[i][0]) / 2
+        element_nodes = sorted_nodes[start:end]
+        mapped_nodes = (element_nodes - center) / radius
+
+        modal_vals = inv_vdm @ vec[i * nnodes:(i + 1) * nnodes]
+        nodal_vals = ortholegvander(mapped_nodes, nnodes - 1) @ modal_vals
+        result.append(nodal_vals)
+
+    result = np.hstack(result)
+    unsorter = np.arange(len(nodes))[sorter]
+    return result[unsorter]
+
+
+def integrate(discr, soln):
+    """Return the integral of the solution.
+
+    Signature: (n*m,) -> ()
+    """
+    soln = soln.reshape((discr.nelements, discr.nnodes))
+    h = discr.elements[0][1] - discr.elements[0][0]
+    weights = discr.ref_weights * h / 2
+    return np.sum(soln * weights)
+
+
+def elementwise(mat, vec):
+    """Apply a matrix to rows of the input representing per-element
+    degrees of freedom.
+
+    Inputs:
+        mat: Shape (a, b)
+        vec: Shape (c, b)
+
+    Signature: (a, b), (c, b) -> (c, a)
+    """
+    return np.einsum("ij,kj->ki", mat, vec)
+
+
+class AbstractDGOps1D(object):
+    def __init__(self, discr):
+        self.discr = discr
+
+    @property
+    def array_ops(self):
+        raise NotImplementedError
+
+    @property
+    def normals(self):
+        """Return the vector of normals at the faces.
+
+        Signature: ->(m, 2)
+        """
+        raise NotImplementedError
+
+    def interp(self, vec):
+        """Apply elementwise volume-to-face interpolation.
+
+        Signature: (m, n) -> (m, 2)
+        """
+        raise NotImplementedError
+
+    def inv_mass(self, vec):
+        """Apply the elementwise inverse mass matrix.
+
+        Signature: (m, n) -> (m, n)
+        """
+        raise NotImplementedError
+
+    def stiffness(self, vec):
+        """Apply the elementwise stiffness matrix.
+
+        Signature: (m, n) -> (m, n)
+        """
+        raise NotImplementedError
+
+    def face_mass(self, vec):
+        """Apply the elementwise face mass matrix.
+
+        Signature: (m, 2) -> (m, n)
+        """
+        raise NotImplementedError
+
+    def face_swap(self, vec):
+        """Swap values at opposite faces.
+
+        Signature: (m, 2) -> (m, 2)
+        """
+        raise NotImplementedError
+
+
+def elementwise(mat, vec):
+    """Apply a matrix to rows of the input representing per-element
+    degrees of freedom.
+
+    Inputs:
+        mat: Shape (a, b)
+        vec: Shape (c, b)
+
+    Signature: (a, b), (c, b) -> (c, a)
+    """
+    return np.einsum("ij,kj->ki", mat, vec)
+
+
+class DGOps1DRef(AbstractDGOps1D):
+    """A reference NumPy implementation of the AbstractDGOps1D interface."""
+
+    @AbstractDGOps1D.array_ops.getter
+    def array_ops(self):
+        return np
+
+    @AbstractDGOps1D.normals.getter
+    def normals(self):
+        return self.discr.normals
+
+    def interp(self, vec):
+        return elementwise(self.discr.interp, vec)
+
+    def inv_mass(self, vec):
+        return elementwise(self.discr.inv_mass, vec)
+
+    def stiffness(self, vec):
+        return elementwise(self.discr.stiffness, vec)
+
+    def face_mass(self, vec):
+        return elementwise(self.discr.face_mass, vec)
+
+    def face_swap(self, vec):
+        result = np.zeros_like(vec)
+        result[:, 0] = np.roll(vec[:, 1], +1)
+        result[:, 1] = np.roll(vec[:, 0], -1)
+        return result
+
+
+def matrix_getter(name, shape):
+
+    def getter(self):
+        data = getattr(self.discr, name)
+        return pt.make_data_wrapper(self.namespace, data, name=name, shape=shape)
+
+    return property(memoized(getter))
+
+
+class DGOps1D(AbstractDGOps1D):
+
+    @AbstractDGOps1D.array_ops.getter
+    def array_ops(self):
+        return pt
+
+    def __init__(self, discr, namespace):
+        self.discr = discr
+        self.namespace = namespace
+
+    _normals = matrix_getter("normals", "(nelements, 2)")
+    _interp_mat = matrix_getter("interp", "(2, nnodes)")
+    _inv_mass_mat = matrix_getter("inv_mass", "(nnodes, nnodes)")
+    _stiffness_mat = matrix_getter("stiffness", "(nnodes, nnodes)")
+    _face_mass_mat = matrix_getter("face_mass", "(nnodes, 2)")
+
+    @AbstractDGOps1D.normals.getter
+    def normals(self):
+        return self._normals
+
+    def interp(self, vec):
+        return pt.matmul(self._interp_mat, vec.T).T
+
+    def inv_mass(self, vec):
+        return pt.matmul(self._inv_mass_mat, vec.T).T
+
+    def stiffness(self, vec):
+        return pt.matmul(self._stiffness_mat, vec.T).T
+
+    def face_mass(self, vec):
+        return pt.matmul(self._face_mass_mat, vec.T).T
+
+    def face_swap(self, vec):
+        return pt.stack(
+                (
+                    pt.roll(vec[:, 1], +1),
+                    pt.roll(vec[:, 0], -1)),
+                axis=1)
diff --git a/pytato/__init__.py b/pytato/__init__.py
index 123041a3844465c966d95e91840c27307f225b0a..8eab25081837165f31abeeeefe8bdf3008fdde8e 100644
--- a/pytato/__init__.py
+++ b/pytato/__init__.py
@@ -26,7 +26,12 @@ THE SOFTWARE.
 
 from pytato.array import (
         Namespace, Array, DictOfNamedArrays, Tag, UniqueTag,
-        DottedName, Placeholder, make_placeholder,
+        DottedName, Placeholder, IndexLambda,
+
+        make_dict_of_named_arrays,
+        make_placeholder, make_size_param, make_data_wrapper,
+
+        matmul, roll, transpose, stack,
         )
 
 from pytato.codegen import generate_loopy
@@ -34,8 +39,14 @@ from pytato.target import Target, PyOpenCLTarget
 
 __all__ = (
         "DottedName", "Namespace", "Array", "DictOfNamedArrays",
-        "Tag", "UniqueTag", "Placeholder", "make_placeholder",
+        "Tag", "UniqueTag", "Placeholder", "IndexLambda",
+
+        "make_dict_of_named_arrays", "make_placeholder", "make_size_param",
+        "make_data_wrapper",
+
+        "matmul", "roll", "transpose", "stack",
 
         "generate_loopy",
+
         "Target", "PyOpenCLTarget",
 )
diff --git a/pytato/array.py b/pytato/array.py
index e9bea177a3b664d41670818de7612d9743782a35..42070f5098e21facd7ddf7d550771cb6f84f4ee1 100644
--- a/pytato/array.py
+++ b/pytato/array.py
@@ -46,6 +46,17 @@ Array Interface
 .. autoclass:: UniqueTag
 .. autoclass:: DictOfNamedArrays
 
+NumPy-Like Interface
+--------------------
+
+These functions generally follow the interface of the corresponding functions in
+:mod:`numpy`, but not all NumPy features may be supported.
+
+.. autofunction:: matmul
+.. autofunction:: roll
+.. autofunction:: transpose
+.. autofunction:: stack
+
 Supporting Functionality
 ------------------------
 
@@ -55,6 +66,7 @@ Supporting Functionality
 
 Concrete Array Data
 -------------------
+
 .. autoclass:: DataInterface
 
 Pre-Defined Tags
@@ -71,10 +83,26 @@ Built-in Expression Nodes
 
 .. autoclass:: IndexLambda
 .. autoclass:: Einsum
+.. autoclass:: MatrixProduct
+.. autoclass:: LoopyFunction
+.. autoclass:: Stack
+
+Index Remapping
+^^^^^^^^^^^^^^^
+
+.. autoclass:: IndexRemappingBase
+.. autoclass:: Roll
+.. autoclass:: AxisPermutation
 .. autoclass:: Reshape
+.. autoclass:: Slice
+
+Input Arguments
+^^^^^^^^^^^^^^^
+
+.. autoclass:: InputArgumentBase
 .. autoclass:: DataWrapper
 .. autoclass:: Placeholder
-.. autoclass:: LoopyFunction
+.. autoclass:: SizeParam
 
 User-Facing Node Creation
 -------------------------
@@ -88,6 +116,8 @@ Node constructors such as :class:`Placeholder.__init__` and
 
 .. autofunction:: make_dict_of_named_arrays
 .. autofunction:: make_placeholder
+.. autofunction:: make_size_param
+.. autofunction:: make_data_wrapper
 """
 
 # }}}
@@ -97,17 +127,31 @@ from numbers import Number
 import operator
 from dataclasses import dataclass
 from typing import (
-        Optional, ClassVar, Dict, Any, Mapping, Iterator, Tuple, Union,
-        FrozenSet, Protocol)
+        Optional, Callable, ClassVar, Dict, Any, Mapping, Iterator, Tuple, Union,
+        FrozenSet, Protocol, Sequence, cast, TYPE_CHECKING)
 
 import numpy as np
 import pymbolic.primitives as prim
-from pytools import is_single_valued, memoize_method
+from pymbolic import var
+from pytools import is_single_valued, memoize_method, UniqueNameGenerator
 
 import pytato.scalar_expr as scalar_expr
 from pytato.scalar_expr import ScalarExpression
 
 
+# Get a type variable that represents the type of '...'
+# https://github.com/python/typing/issues/684#issuecomment-548203158
+if TYPE_CHECKING:
+    from enum import Enum
+
+    class EllipsisType(Enum):
+        Ellipsis = "..."
+
+    Ellipsis = EllipsisType.Ellipsis
+else:
+    EllipsisType = type(Ellipsis)
+
+
 # {{{ dotted name
 
 class DottedName:
@@ -131,7 +175,7 @@ class DottedName:
             raise ValueError("empty name parts")
 
         for p in name_parts:
-            if not str.isidentifier(p):
+            if not p.isidentifier():
                 raise ValueError(f"{p} is not a Python identifier")
 
         self.name_parts = name_parts
@@ -159,6 +203,7 @@ class Namespace(Mapping[str, "Array"]):
     may not be used.  (:class:`Placeholder` instances register their names in
     this way to avoid ambiguity.)
 
+    .. attribute:: name_gen
     .. automethod:: __contains__
     .. automethod:: __getitem__
     .. automethod:: __iter__
@@ -167,17 +212,17 @@ class Namespace(Mapping[str, "Array"]):
     .. automethod:: copy
     .. automethod:: ref
     """
+    name_gen: UniqueNameGenerator
 
     def __init__(self) -> None:
         self._symbol_table: Dict[str, Array] = {}
+        self.name_gen = UniqueNameGenerator()
 
     def __contains__(self, name: object) -> bool:
         return name in self._symbol_table
 
     def __getitem__(self, name: str) -> Array:
         item = self._symbol_table[name]
-        if item is None:
-            raise ValueError("cannot access a reserved name")
         return item
 
     def __iter__(self) -> Iterator[str]:
@@ -187,7 +232,8 @@ class Namespace(Mapping[str, "Array"]):
         return len(self._symbol_table)
 
     def copy(self) -> Namespace:
-        raise NotImplementedError
+        from pytato.transform import CopyMapper, copy_namespace
+        return copy_namespace(self, CopyMapper(Namespace()))
 
     def assign(self, name: str, value: Array) -> str:
         """Declare a new array.
@@ -199,6 +245,8 @@ class Namespace(Mapping[str, "Array"]):
         """
         if name in self._symbol_table:
             raise ValueError(f"'{name}' is already assigned")
+        if not self.name_gen.is_name_conflicting(name):
+            self.name_gen.add_name(name)
         self._symbol_table[name] = value
 
         return name
@@ -277,7 +325,7 @@ ConvertibleToShape = Union[
 
 
 def _check_identifier(s: str, ns: Optional[Namespace] = None) -> bool:
-    if not str.isidentifier(s):
+    if not s.isidentifier():
         raise ValueError(f"'{s}' is not a valid identifier")
 
     if ns is not None:
@@ -334,6 +382,19 @@ def normalize_shape(
 
 # {{{ array inteface
 
+SliceItem = Union[int, slice, None, EllipsisType]
+DtypeOrScalar = Union[np.dtype, Number]
+
+
+def _truediv_result_type(arg1: DtypeOrScalar, arg2: DtypeOrScalar) -> np.dtype:
+    dtype = np.result_type(arg1, arg2)
+    # See: test_true_divide in numpy/core/tests/test_ufunc.py
+    if dtype.kind in "iu":
+        return np.float64
+    else:
+        return dtype
+
+
 class Array:
     """
     A base class (abstract interface + supplemental functionality) for lazily
@@ -352,7 +413,7 @@ class Array:
 
     .. attribute:: namespace
 
-        A (mutable) instance of :class:`Namespace` containing the
+        A (mutable) instance of :class:`~pytato.Namespace` containing the
         names used in the computation. All arrays in a
         computation share the same namespace.
 
@@ -388,28 +449,45 @@ class Array:
     .. automethod:: tagged
     .. automethod:: without_tag
 
+    Array interface:
+
+    .. automethod:: __getitem__
+    .. attribute:: T
+
+    .. method:: __mul__
+    .. method:: __rmul__
+    .. method:: __add__
+    .. method:: __radd__
+    .. method:: __sub__
+    .. method:: __rsub__
+    .. method:: __truediv__
+    .. method:: __rtruediv__
+    .. method:: __neg__
+    .. method:: __pos__
+
     Derived attributes:
 
     .. attribute:: ndim
 
     """
-    mapper_method: ClassVar[str]
+    _mapper_method: ClassVar[str]
     # A tuple of field names. Fields must be equality comparable and
     # hashable. Dicts of hashable keys and values are also permitted.
-    fields: ClassVar[Tuple[str, ...]] = ("shape", "dtype", "tags")
+    _fields: ClassVar[Tuple[str, ...]] = ("shape", "dtype", "tags")
 
-    def __init__(self,
-            namespace: Namespace,
-            tags: Optional[TagsType] = None):
+    def __init__(self, tags: Optional[TagsType] = None):
         if tags is None:
             tags = frozenset()
 
-        self.namespace = namespace
         self.tags = tags
 
     def copy(self, **kwargs: Any) -> Array:
         raise NotImplementedError
 
+    @property
+    def namespace(self) -> Namespace:
+        raise NotImplementedError
+
     @property
     def shape(self) -> ShapeType:
         raise NotImplementedError
@@ -421,10 +499,97 @@ class Array:
     def named(self, name: str) -> Array:
         return self.namespace.ref(self.namespace.assign(name, self))
 
+    def __getitem__(self,
+            slice_spec: Union[SliceItem, Tuple[SliceItem, ...]]) -> Array:
+        if not isinstance(slice_spec, tuple):
+            slice_spec = (slice_spec,)
+
+        # FIXME: This doesn't support all NumPy basic indexing constructs,
+        # including:
+        #
+        # - newaxis
+        # - Ellipsis
+        # - slices with nontrivial strides
+        # - slices with bounds that exceed array dimensions
+        # - slices with negative indices
+        # - slices that yield an array with a zero dimension (lacks codegen support)
+        #
+        # Symbolic expression support is also limited.
+
+        starts = []
+        sizes = []
+        kept_dims = []
+
+        slice_spec_expanded = []
+
+        for elem in slice_spec:
+            if elem is Ellipsis:
+                raise NotImplementedError("'...' is unsupported")
+            elif elem is None:
+                raise NotImplementedError("newaxis is unsupported")
+            else:
+                assert isinstance(elem, (int, slice))
+                slice_spec_expanded.append(elem)
+
+        slice_spec_expanded.extend(
+                [slice(None, None, None)] * (self.ndim - len(slice_spec)))
+
+        if len(slice_spec_expanded) > self.ndim:
+            raise ValueError("too many dimensions in index")
+
+        for i, elem in enumerate(slice_spec_expanded):
+            if isinstance(elem, slice):
+                start = elem.start
+                if start is None:
+                    start = 0
+                stop = elem.stop
+                if stop is None:
+                    stop = self.shape[i]
+                stride = elem.step
+                if stride is not None and stride != 1:
+                    raise ValueError("non-trivial strides unsupported")
+                starts.append(start)
+                sizes.append(stop - start)
+                kept_dims.append(i)
+
+            elif isinstance(elem, int):
+                starts.append(elem)
+                sizes.append(1)
+
+            else:
+                raise ValueError("unknown index along dimension")
+
+        slice_ = _make_slice(self, starts, sizes)
+
+        if len(kept_dims) != self.ndim:
+            # Return an IndexLambda that elides the indexed-into dimensions
+            # (as opposed to the ones that were sliced).
+            indices = [0] * self.ndim
+            shape = []
+            for i, dim in enumerate(kept_dims):
+                indices[dim] = var(f"_{i}")
+                shape.append(slice_.shape[dim])
+            expr = var("_in0")
+            if indices:
+                expr = expr[tuple(indices)]
+
+            # FIXME: Flatten into a single IndexLambda
+            return IndexLambda(self.namespace,
+                    expr,
+                    shape=tuple(shape),
+                    dtype=self.dtype,
+                    bindings=dict(_in0=slice_))
+        else:
+            return slice_
+
     @property
     def ndim(self) -> int:
         return len(self.shape)
 
+    @property
+    def T(self) -> Array:
+        return AxisPermutation(self, tuple(range(self.ndim)[::-1]))
+
     def tagged(self, tag: Tag) -> Array:
         """
         Returns a copy of *self* tagged with *tag*.
@@ -445,7 +610,7 @@ class Array:
     @memoize_method
     def __hash__(self) -> int:
         attrs = []
-        for field in self.fields:
+        for field in self._fields:
             attr = getattr(self, field)
             if isinstance(attr, dict):
                 attr = frozenset(attr.items())
@@ -460,50 +625,98 @@ class Array:
                 and self.namespace is other.namespace
                 and all(
                     getattr(self, field) == getattr(other, field)
-                    for field in self.fields))
+                    for field in self._fields))
 
     def __ne__(self, other: Any) -> bool:
         return not self.__eq__(other)
 
+    def __matmul__(self, other: Array, reverse: bool = False) -> Array:
+        first = self
+        second = other
+        if reverse:
+            first, second = second, first
+        return matmul(first, second)
+
+    __rmatmul__ = partialmethod(__matmul__, reverse=True)
+
     def _binary_op(self,
             op: Any,
             other: Union[Array, Number],
+            get_result_type: Callable[[DtypeOrScalar, DtypeOrScalar], np.dtype] = np.result_type,  # noqa
             reverse: bool = False) -> Array:
+
+        def add_indices(val: prim.Expression) -> prim.Expression:
+            if self.ndim == 0:
+                return val
+            else:
+                indices = tuple(var(f"_{i}") for i in range(self.ndim))
+                return val[indices]
+
         if isinstance(other, Number):
-            # TODO
-            raise NotImplementedError
-        else:
+            first_expr = add_indices(var("_in0"))
+            second_expr = other
+            bindings = dict(_in0=self)
+            dtype = get_result_type(self.dtype, other)
+
+        elif isinstance(other, Array):
             if self.shape != other.shape:
-                raise NotImplementedError("broadcasting")
+                raise NotImplementedError("broadcasting not supported")
+            first_expr = add_indices(var("_in0"))
+            second_expr = add_indices(var("_in1"))
+            bindings = dict(_in0=self, _in1=other)
+            dtype = get_result_type(self.dtype, other.dtype)
 
-            dtype = np.result_type(self.dtype, other.dtype)
+        else:
+            raise ValueError("unknown argument")
 
-            # FIXME: If either *self* or *other* is an IndexLambda, its expression
-            # could be folded into the output, producing a fused result.
-            if self.shape == ():
-                expr = op(prim.Variable("_in0"), prim.Variable("_in1"))
-            else:
-                indices = tuple(
-                        prim.Variable(f"_{i}") for i in range(self.ndim))
-                expr = op(
-                        prim.Variable("_in0")[indices],
-                        prim.Variable("_in1")[indices])
+        if reverse:
+            first_expr, second_expr = second_expr, first_expr
 
-            first, second = self, other
-            if reverse:
-                first, second = second, first
+        expr = op(first_expr, second_expr)
 
-            bindings = dict(_in0=first, _in1=second)
+        return IndexLambda(self.namespace,
+                expr,
+                shape=self.shape,
+                dtype=dtype,
+                bindings=bindings)
 
-            return IndexLambda(self.namespace,
-                    expr,
-                    shape=self.shape,
-                    dtype=dtype,
-                    bindings=bindings)
+    def _unary_op(self, op: Any) -> Array:
+        if self.ndim == 0:
+            expr = op(var("_in0"))
+        else:
+            indices = tuple(var(f"_{i}") for i in range(self.ndim))
+            expr = op(var("_in0")[indices])
+
+        bindings = dict(_in0=self)
+        return IndexLambda(self.namespace,
+                expr,
+                shape=self.shape,
+                dtype=self.dtype,
+                bindings=bindings)
 
     __mul__ = partialmethod(_binary_op, operator.mul)
     __rmul__ = partialmethod(_binary_op, operator.mul, reverse=True)
 
+    __add__ = partialmethod(_binary_op, operator.add)
+    __radd__ = partialmethod(_binary_op, operator.add, reverse=True)
+
+    __sub__ = partialmethod(_binary_op, operator.sub)
+    __rsub__ = partialmethod(_binary_op, operator.sub, reverse=True)
+
+    __truediv__ = partialmethod(_binary_op, operator.truediv,
+            get_result_type=_truediv_result_type)
+    __rtruediv__ = partialmethod(_binary_op, operator.truediv,
+            get_result_type=_truediv_result_type, reverse=True)
+
+    __neg__ = partialmethod(_unary_op, operator.neg)
+
+    def __pos__(self) -> Array:
+        return self
+
+# }}}
+
+
+# {{{ mixins
 
 class _SuppliedShapeAndDtypeMixin(object):
     """A mixin class for when an array must store its own *shape* and *dtype*,
@@ -511,12 +724,11 @@ class _SuppliedShapeAndDtypeMixin(object):
     """
 
     def __init__(self,
-            namespace: Namespace,
             shape: ShapeType,
             dtype: np.dtype,
             **kwargs: Any):
         # https://github.com/python/mypy/issues/5887
-        super().__init__(namespace, **kwargs)  # type: ignore
+        super().__init__(**kwargs)  # type: ignore
         self._shape = shape
         self._dtype = dtype
 
@@ -644,8 +856,9 @@ class IndexLambda(_SuppliedShapeAndDtypeMixin, Array):
 
     .. automethod:: is_reference
     """
-    fields = Array.fields + ("expr", "bindings")
-    mapper_method = "map_index_lambda"
+
+    _fields = Array._fields + ("expr", "bindings")
+    _mapper_method = "map_index_lambda"
 
     def __init__(self,
             namespace: Namespace,
@@ -658,11 +871,16 @@ class IndexLambda(_SuppliedShapeAndDtypeMixin, Array):
         if bindings is None:
             bindings = {}
 
-        super().__init__(namespace, shape=shape, dtype=dtype, tags=tags)
+        super().__init__(shape=shape, dtype=dtype, tags=tags)
 
+        self._namespace = namespace
         self.expr = expr
         self.bindings = bindings
 
+    @property
+    def namespace(self) -> Namespace:
+        return self._namespace
+
     @memoize_method
     def is_reference(self) -> bool:
         # FIXME: Do we want a specific 'reference' node to make all this
@@ -678,7 +896,7 @@ class IndexLambda(_SuppliedShapeAndDtypeMixin, Array):
         else:
             return False
 
-        if index != tuple("_%d" % i for i in range(len(self.shape))):
+        if index != tuple(var("_%d") % i for i in range(len(self.shape))):
             return False
 
         try:
@@ -706,12 +924,282 @@ class Einsum(Array):
 # }}}
 
 
+# {{{ matrix product
+
+class MatrixProduct(Array):
+    """A product of two matrices, or a matrix and a vector.
+
+    The semantics of this operation follow PEP 465 [pep465]_, i.e., the Python
+    matmul (@) operator.
+
+    .. attribute:: x1
+    .. attribute:: x2
+
+    .. [pep465] https://www.python.org/dev/peps/pep-0465/
+
+    """
+    _fields = Array._fields + ("x1", "x2")
+
+    _mapper_method = "map_matrix_product"
+
+    def __init__(self,
+            x1: Array,
+            x2: Array,
+            tags: Optional[TagsType] = None):
+        super().__init__(tags)
+        self.x1 = x1
+        self.x2 = x2
+
+    @property
+    def namespace(self) -> Namespace:
+        return self.x1.namespace
+
+    @property
+    def shape(self) -> ShapeType:
+        # FIXME: Broadcasting currently unsupported.
+        assert 0 < self.x1.ndim <= 2
+        assert 0 < self.x2.ndim <= 2
+
+        if self.x1.ndim == 1 and self.x2.ndim == 1:
+            return ()
+        elif self.x1.ndim == 1 and self.x2.ndim == 2:
+            return (self.x2.shape[1],)
+        elif self.x1.ndim == 2 and self.x2.ndim == 1:
+            return (self.x1.shape[0],)
+        elif self.x1.ndim == 2 and self.x2.ndim == 2:
+            return (self.x1.shape[0], self.x2.shape[1])
+
+        assert False
+
+    @property
+    def dtype(self) -> np.dtype:
+        return np.result_type(self.x1.dtype, self.x2.dtype)
+
+# }}}
+
+
+# {{{ stack
+
+class Stack(Array):
+    """Join a sequence of arrays along an axis.
+
+    .. attribute:: arrays
+
+        The sequence of arrays to join
+
+    .. attribute:: axis
+
+        The output axis
+
+    """
+
+    _fields = Array._fields + ("arrays", "axis")
+    _mapper_method = "map_stack"
+
+    def __init__(self,
+            arrays: Tuple[Array, ...],
+            axis: int,
+            tags: Optional[TagsType] = None):
+        super().__init__(tags)
+        self.arrays = arrays
+        self.axis = axis
+
+    @property
+    def namespace(self) -> Namespace:
+        return self.arrays[0].namespace
+
+    @property
+    def dtype(self) -> np.dtype:
+        return np.result_type(*(arr.dtype for arr in self.arrays))
+
+    @property
+    def shape(self) -> ShapeType:
+        result = list(self.arrays[0].shape)
+        result.insert(self.axis, len(self.arrays))
+        return tuple(result)
+
+# }}}
+
+
+# {{{ index remapping
+
+class IndexRemappingBase(Array):
+    """Base class for operations that remap the indices of an array.
+
+    Note that index remappings can also be expressed via
+    :class:`~pytato.IndexLambda`.
+
+    .. attribute:: array
+
+        The input :class:`~pytato.Array`
+
+    """
+    _fields = Array._fields + ("array",)
+
+    def __init__(self,
+            array: Array,
+            tags: Optional[TagsType] = None):
+        super().__init__(tags)
+        self.array = array
+
+    @property
+    def dtype(self) -> np.dtype:
+        return self.array.dtype
+
+    @property
+    def namespace(self) -> Namespace:
+        return self.array.namespace
+
+# }}}
+
+
+# {{{ roll
+
+class Roll(IndexRemappingBase):
+    """Roll an array along an axis.
+
+    .. attribute:: shift
+
+        Shift amount.
+
+    .. attribute:: axis
+
+        Shift axis.
+    """
+    _fields = IndexRemappingBase._fields + ("shift", "axis")
+    _mapper_method = "map_roll"
+
+    def __init__(self,
+            array: Array,
+            shift: int,
+            axis: int,
+            tags: Optional[TagsType] = None):
+        super().__init__(array, tags)
+        self.shift = shift
+        self.axis = axis
+
+    @property
+    def shape(self) -> ShapeType:
+        return self.array.shape
+
+# }}}
+
+
+# {{{ axis permutation
+
+class AxisPermutation(IndexRemappingBase):
+    r"""Permute the axes of an array.
+
+    .. attribute:: axes
+
+        A permutation of the input axes.
+    """
+    _fields = IndexRemappingBase._fields + ("axes",)
+    _mapper_method = "map_axis_permutation"
+
+    def __init__(self,
+            array: Array,
+            axes: Tuple[int, ...],
+            tags: Optional[TagsType] = None):
+        super().__init__(array, tags)
+        self.array = array
+        self.axes = axes
+
+    @property
+    def shape(self) -> ShapeType:
+        result = []
+        base_shape = self.array.shape
+        for index in self.axes:
+            result.append(base_shape[index])
+        return tuple(result)
+
+# }}}
+
+
 # {{{ reshape
 
-class Reshape(Array):
+class Reshape(IndexRemappingBase):
+    """
+    """
+
+# }}}
+
+
+# {{{ slice
+
+class Slice(IndexRemappingBase):
+    """Extracts a slice of constant size from an array.
+
+    .. attribute:: begin
+    .. attribute:: size
     """
+    _fields = IndexRemappingBase._fields + ("begin", "size")
+    _mapper_method = "map_slice"
+
+    def __init__(self,
+            array: Array,
+            begin: Tuple[int, ...],
+            size: Tuple[int, ...],
+            tags: Optional[TagsType] = None):
+        super().__init__(array, tags)
+
+        self.begin = begin
+        self.size = size
+
+    @property
+    def shape(self) -> ShapeType:
+        return self.size
+
+# }}}
+
+
+# {{{ base class for arguments
+
+class InputArgumentBase(Array):
+    r"""Base class for input arguments.
+
+    .. attribute:: name
+
+        The name by which a value is supplied for the argument once computation
+        begins.
+
+        The name is also implicitly :meth:`~pytato.Namespace.assign`\ ed in the
+        :class:`~pytato.Namespace`.
+
+    .. note::
+
+        Creating multiple instances of any input argument with the same name
+        and within the same :class:`~pytato.Namespace` is not allowed.
     """
 
+    # The name uniquely identifies this object in the namespace. Therefore,
+    # subclasses don't have to update *_fields*.
+    _fields = ("name",)
+
+    def __init__(self,
+            namespace: Namespace,
+            name: str,
+            tags: Optional[TagsType] = None):
+        if name is None:
+            raise ValueError("Must have explicit name")
+
+        # Publish our name to the namespace.
+        namespace.assign(name, self)
+
+        self._namespace = namespace
+        super().__init__(tags=tags)
+        self.name = name
+
+    @property
+    def namespace(self) -> Namespace:
+        return self._namespace
+
+    def tagged(self, tag: Tag) -> Array:
+        raise ValueError("Cannot modify tags")
+
+    def without_tag(self, tag: Tag, verify_existence: bool = True) -> Array:
+        raise ValueError("Cannot modify tags")
+
 # }}}
 
 
@@ -729,15 +1217,14 @@ class DataInterface(Protocol):
     .. attribute:: shape
     .. attribute:: dtype
     """
+
     shape: ShapeType
     dtype: np.dtype
 
 
-class DataWrapper(Array):
-    # TODO: Name?
-    """
-    Takes concrete array data and packages it to be compatible
-    with the :class:`Array` interface.
+class DataWrapper(InputArgumentBase):
+    """Takes concrete array data and packages it to be compatible with the
+    :class:`Array` interface.
 
     .. attribute:: data
 
@@ -751,16 +1238,22 @@ class DataWrapper(Array):
         this array may not be updated in-place.
     """
 
+    _mapper_method = "map_data_wrapper"
+
     def __init__(self,
             namespace: Namespace,
+            name: str,
             data: DataInterface,
+            shape: ShapeType,
             tags: Optional[TagsType] = None):
-        super().__init__(namespace, tags=tags)
+        super().__init__(namespace, name, tags=tags)
+
         self.data = data
+        self._shape = shape
 
     @property
     def shape(self) -> ShapeType:
-        return self.data.shape
+        return self._shape
 
     @property
     def dtype(self) -> np.dtype:
@@ -771,25 +1264,12 @@ class DataWrapper(Array):
 
 # {{{ placeholder
 
-class Placeholder(_SuppliedShapeAndDtypeMixin, Array):
-    r"""
-    A named placeholder for an array whose concrete value
-    is supplied by the user during evaluation.
-
-    .. attribute:: name
-
-        The name by which a value is supplied
-        for the placeholder once computation begins.
-        The name is also implicitly :meth:`~Namespace.assign`\ ed
-        in the :class:`Namespace`.
-
-    .. note::
-
-        Creating multiple instances of a :class:`Placeholder` with the same name
-        and within the same :class:`Namespace` is not allowed.
+class Placeholder(_SuppliedShapeAndDtypeMixin, InputArgumentBase):
+    r"""A named placeholder for an array whose concrete value is supplied by the
+    user during evaluation.
     """
-    mapper_method = "map_placeholder"
-    fields = Array.fields + ("name",)
+
+    _mapper_method = "map_placeholder"
 
     def __init__(self,
             namespace: Namespace,
@@ -797,24 +1277,31 @@ class Placeholder(_SuppliedShapeAndDtypeMixin, Array):
             shape: ShapeType,
             dtype: np.dtype,
             tags: Optional[TagsType] = None):
-        if name is None:
-            raise ValueError("Must have explicit name")
-
-        # Publish our name to the namespace
-        namespace.assign(name, self)
-
-        super().__init__(namespace=namespace,
-                shape=shape,
+        super().__init__(shape=shape,
                 dtype=dtype,
+                namespace=namespace,
+                name=name,
                 tags=tags)
 
-        self.name = name
+# }}}
 
-    def tagged(self, tag: Tag) -> Array:
-        raise ValueError("Cannot modify tags")
 
-    def without_tag(self, tag: Tag, verify_existence: bool = True) -> Array:
-        raise ValueError("Cannot modify tags")
+# {{{ size parameter
+
+class SizeParam(InputArgumentBase):
+    r"""A named placeholder for a scalar that may be used as a variable in symbolic
+    expressions for array sizes.
+    """
+
+    _mapper_method = "map_size_param"
+
+    @property
+    def shape(self) -> ShapeType:
+        return ()
+
+    @property
+    def dtype(self) -> np.dtype:
+        return np.dtype(np.intp)
 
 # }}}
 
@@ -830,11 +1317,155 @@ class LoopyFunction(DictOfNamedArrays):
         name.
     """
 
-
 # }}}
 
 
-# {{{ end-user-facing
+# {{{ end-user facing
+
+def matmul(x1: Array, x2: Array) -> Array:
+    """Matrix multiplication.
+
+    :param x1: first argument
+    :param x2: second argument
+    """
+    if (
+            isinstance(x1, Number)
+            or x1.shape == ()
+            or isinstance(x2, Number)
+            or x2.shape == ()):
+        raise ValueError("scalars not allowed as arguments to matmul")
+
+    if x1.namespace is not x2.namespace:
+        raise ValueError("namespace mismatch")
+
+    if len(x1.shape) > 2 or len(x2.shape) > 2:
+        raise NotImplementedError("broadcasting not supported")
+
+    if x1.shape[-1] != x2.shape[0]:
+        raise ValueError("dimension mismatch")
+
+    return MatrixProduct(x1, x2)
+
+
+def roll(a: Array, shift: int, axis: Optional[int] = None) -> Array:
+    """Roll array elements along a given axis.
+
+    :param a: input array
+    :param shift: the number of places by which elements are shifted
+    :param axis: axis along which the array is shifted
+    """
+    if a.ndim == 0:
+        return a
+
+    if axis is None:
+        if a.ndim > 1:
+            raise NotImplementedError(
+                    "shifing along more than one dimension is unsupported")
+        else:
+            axis = 0
+
+    if not (0 <= axis < a.ndim):
+        raise ValueError("invalid axis")
+
+    if shift == 0:
+        return a
+
+    return Roll(a, shift, axis)
+
+
+def transpose(a: Array, axes: Optional[Sequence[int]] = None) -> Array:
+    """Reverse or permute the axes of an array.
+
+    :param a: input array
+    :param axes: if specified, a permutation of ``[0, 1, ..., a.ndim-1]``. Defaults
+        to ``range(a.ndim)[::-1]``. The returned axis at index *i* corresponds to
+        the input axis *axes[i]*.
+    """
+    if axes is None:
+        axes = range(a.ndim)[::-1]
+
+    if len(axes) != a.ndim:
+        raise ValueError("axes have incorrect length")
+
+    if set(axes) != set(range(a.ndim)):
+        raise ValueError("repeated or out-of-bounds axes detected")
+
+    return AxisPermutation(a, tuple(axes))
+
+
+def stack(arrays: Sequence[Array], axis: int = 0) -> Array:
+    """Join a sequence of arrays along a new axis.
+
+    The *axis* parameter specifies the position of the new axis in the result.
+
+    Example::
+
+       >>> arrays = [pt.zeros(3)] * 4
+       >>> pt.stack(arrays, axis=0).shape
+       (4, 3)
+
+    :param arrays: a finite sequence, each of whose elements is an
+        :class:`Array` of the same shape
+    :param axis: the position of the new axis, which will have length
+        *len(arrays)*
+    """
+
+    if not arrays:
+        raise ValueError("need at least one array to stack")
+
+    for array in arrays[1:]:
+        if array.shape != arrays[0].shape:
+            raise ValueError("arrays must have the same shape")
+
+    if not (0 <= axis <= arrays[0].ndim):
+        raise ValueError("invalid axis")
+
+    return Stack(tuple(arrays), axis)
+
+
+def _make_slice(array: Array, begin: Sequence[int], size: Sequence[int]) -> Array:
+    """Extract a constant-sized slice from an array with constant offsets.
+
+    :param array: input array
+    :param begin: a sequence of length *array.ndim* containing slice
+        offsets. Must be in bounds.
+    :param size: a sequence of length *array.ndim* containing the sizes of
+        the slice along each dimension. Must be in bounds.
+
+    .. note::
+
+        Support for slices is currently limited. Among other things, non-trivial
+        slices (i.e., not the length of the whole dimension) involving symbolic
+        expressions are unsupported.
+    """
+    if array.ndim != len(begin):
+        raise ValueError("'begin' and 'array' do not match in number of dimensions")
+
+    if len(begin) != len(size):
+        raise ValueError("'begin' and 'size' do not match in number of dimensions")
+
+    for i, (bval, sval) in enumerate(zip(begin, size)):
+        symbolic_index = not (
+                isinstance(array.shape[i], int)
+                and isinstance(bval, int)
+                and isinstance(sval, int))
+
+        if symbolic_index:
+            if not (0 == bval and sval == array.shape[i]):
+                raise ValueError(
+                        "slicing with symbolic dimensions is unsupported")
+            continue
+
+        ubound: int = cast(int, array.shape[i])
+        if not (0 <= bval < ubound):
+            raise ValueError("index '%d' of 'begin' out of bounds" % i)
+
+        if sval < 0 or not (0 <= bval + sval <= ubound):
+            raise ValueError("index '%d' of 'size' out of bounds" % i)
+
+    # FIXME: Generate IndexLambda when possible
+    return Slice(array, tuple(begin), tuple(size))
+
 
 def make_dict_of_named_arrays(data: Dict[str, Array]) -> DictOfNamedArrays:
     """Make a :class:`DictOfNamedArrays` object and ensure that all arrays
@@ -851,7 +1482,7 @@ def make_dict_of_named_arrays(data: Dict[str, Array]) -> DictOfNamedArrays:
 def make_placeholder(namespace: Namespace,
         name: str,
         shape: ConvertibleToShape,
-        dtype: np.dtype,
+        dtype: Any,
         tags: Optional[TagsType] = None) -> Placeholder:
     """Make a :class:`Placeholder` object.
 
@@ -859,18 +1490,69 @@ def make_placeholder(namespace: Namespace,
     :param name:       name of the placeholder array
     :param shape:      shape of the placeholder array
     :param dtype:      dtype of the placeholder array
+                       (must be convertible to :class:`numpy.dtype`)
     :param tags:       implementation tags
     """
     if name is None:
         raise ValueError("Placeholder instances must have a name")
 
-    if not str.isidentifier(name):
+    if not name.isidentifier():
         raise ValueError(f"'{name}' is not a Python identifier")
 
     shape = normalize_shape(shape, namespace)
+    dtype = np.dtype(dtype)
 
     return Placeholder(namespace, name, shape, dtype, tags)
 
+
+def make_size_param(namespace: Namespace,
+        name: str,
+        tags: Optional[TagsType] = None) -> SizeParam:
+    """Make a :class:`SizeParam`.
+
+    Size parameters may be used as variables in symbolic expressions for array
+    sizes.
+
+    :param namespace:  namespace
+    :param name:       name
+    :param tags:       implementation tags
+    """
+    if name is None:
+        raise ValueError("SizeParam instances must have a name")
+
+    if not name.isidentifier():
+        raise ValueError(f"'{name}' is not a Python identifier")
+
+    return SizeParam(namespace, name, tags=tags)
+
+
+def make_data_wrapper(namespace: Namespace,
+        data: DataInterface,
+        name: Optional[str] = None,
+        shape: Optional[ConvertibleToShape] = None,
+        tags: Optional[TagsType] = None) -> DataWrapper:
+    """Make a :class:`DataWrapper`.
+
+    :param namespace:  namespace
+    :param data:       an instance obeying the :class:`DataInterface`
+    :param name:       an optional name, generated automatically if not given
+    :param shape:      optional shape of the array, inferred from *data* if not given
+    :param tags:       implementation tags
+    """
+    if name is None:
+        name = namespace.name_gen("_pt_data")
+
+    if not name.isidentifier():
+        raise ValueError(f"'{name}' is not a Python identifier")
+
+    if shape is None:
+        shape = data.shape
+
+    shape = normalize_shape(shape, namespace)
+
+    return DataWrapper(namespace, name, data, shape, tags)
+
 # }}}
 
+
 # vim: foldmethod=marker
diff --git a/pytato/codegen.py b/pytato/codegen.py
index 5a7a7cb36d66ca0e98917f0dfee4cef3b7618688..ad4fbc545e3f04324fe94727a013a43d00360cd7 100644
--- a/pytato/codegen.py
+++ b/pytato/codegen.py
@@ -23,20 +23,27 @@ THE SOFTWARE.
 """
 
 import dataclasses
-from typing import (Union, Optional, Mapping, Dict, Tuple, FrozenSet, Set)
+from functools import partialmethod
+import re
+from typing import (
+        Union, Optional, Mapping, Dict, Tuple, FrozenSet, Set, Callable)
 
 import islpy as isl
 import loopy as lp
 import pymbolic.primitives as prim
+from pymbolic import var
 import pytools
 
 from pytato.array import (
-        Array, DictOfNamedArrays, Placeholder, ShapeType, IndexLambda)
+        Array, DictOfNamedArrays, ShapeType, IndexLambda,
+        SizeParam, DataWrapper, InputArgumentBase, MatrixProduct, Roll,
+        AxisPermutation, Slice, IndexRemappingBase, Stack, Placeholder,
+        Namespace, DataInterface)
 from pytato.program import BoundProgram
 from pytato.target import Target, PyOpenCLTarget
 import pytato.scalar_expr as scalar_expr
 from pytato.scalar_expr import ScalarExpression
-import pytato.transform
+from pytato.transform import Mapper, CopyMapper
 
 
 __doc__ = """
@@ -52,6 +59,8 @@ Code Generation Internals
 
 .. currentmodule:: pytato.codegen
 
+.. autoclass:: CodeGenPreprocessor
+
 .. autoclass:: LoopyExpressionContext
 .. autoclass:: ImplementedResult
 .. autoclass:: StoredResult
@@ -64,11 +73,129 @@ Code Generation Internals
 .. autoclass:: InlinedExpressionGenMapper
 
 .. autofunction:: domain_for_shape
-.. autofunction:: add_output
+.. autofunction:: get_loopy_temporary
+.. autofunction:: add_store
+.. autofunction:: rename_reductions
+.. autofunction:: get_initial_codegen_state
+.. autofunction:: preprocess
 
 """
 
 
+# {{{ preprocessing for codegen
+
+class CodeGenPreprocessor(CopyMapper):
+    """A mapper that preprocesses graphs to simplify code generation.
+
+    The following node simplifications are performed:
+
+    ======================================  =====================================
+    Source Node Type                        Target Node Type
+    ======================================  =====================================
+    :class:`~pytato.array.DataWrapper`      :class:`~pytato.array.Placeholder`
+    :class:`~pytato.array.Roll`             :class:`~pytato.array.IndexLambda`
+    :class:`~pytato.array.AxisPermutation`  :class:`~pytato.array.IndexLambda`
+    :class:`~pytato.array.Slice`            :class:`~pytato.array.IndexLambda`
+    ======================================  =====================================
+    """
+
+    # TODO:
+    # Stack -> IndexLambda
+    # MatrixProduct -> Einsum
+
+    def __init__(self, namespace: Namespace):
+        super().__init__(namespace)
+        self.bound_arguments: Dict[str, DataInterface] = {}
+
+    def map_data_wrapper(self, expr: DataWrapper) -> Array:
+        self.bound_arguments[expr.name] = expr.data
+        return Placeholder(namespace=self.namespace,
+                name=expr.name,
+                shape=expr.shape,
+                dtype=expr.dtype,
+                tags=expr.tags)
+
+    def map_stack(self, expr: Stack) -> Array:
+
+        def get_subscript(array_index: int) -> SymbolicIndex:
+            result = []
+            for i in range(expr.ndim):
+                if i != expr.axis:
+                    result.append(var(f"_{i}"))
+            return tuple(result)
+
+        # I = axis index
+        #
+        # => If(_I == 0,
+        #        _in0[_0, _1, ...],
+        #        If(_I == 1,
+        #            _in1[_0, _1, ...],
+        #            ...
+        #                _inNm1[_0, _1, ...] ...))
+        for i in range(len(expr.arrays) - 1, -1, -1):
+            subarray_expr = var(f"_in{i}")[get_subscript(i)]
+            if i == len(expr.arrays) - 1:
+                stack_expr = subarray_expr
+            else:
+                from pymbolic.primitives import If, Comparison
+                stack_expr = If(Comparison(var(f"_{expr.axis}"), "==", i),
+                        subarray_expr,
+                        stack_expr)
+
+        bindings = {f"_in{i}": self.rec(array)
+                for i, array in enumerate(expr.arrays)}
+
+        return IndexLambda(namespace=self.namespace,
+                expr=stack_expr,
+                shape=expr.shape,
+                dtype=expr.dtype,
+                bindings=bindings)
+
+    # {{{ index remapping (roll, axis permutation, slice)
+
+    def handle_index_remapping(self,
+            indices_getter: Callable[[CodeGenPreprocessor, Array], SymbolicIndex],
+            expr: IndexRemappingBase) -> Array:
+        indices = indices_getter(self, expr)
+
+        index_expr = var("_in0")
+        if indices:
+            index_expr = index_expr[indices]
+
+        array = self.rec(expr.array)
+
+        return IndexLambda(namespace=self.namespace,
+                expr=index_expr,
+                shape=expr.shape,
+                dtype=expr.dtype,
+                bindings=dict(_in0=array))
+
+    def _indices_for_roll(self, expr: Roll) -> SymbolicIndex:
+        indices = [var(f"_{d}") for d in range(expr.ndim)]
+        axis = expr.axis
+        indices[axis] = (indices[axis] - expr.shift) % expr.shape[axis]
+        return tuple(indices)
+
+    def _indices_for_axis_permutation(self, expr: AxisPermutation) -> SymbolicIndex:
+        indices = [None] * expr.ndim
+        for from_index, to_index in enumerate(expr.axes):
+            indices[to_index] = var(f"_{from_index}")
+        return tuple(indices)
+
+    def _indices_for_slice(self, expr: Slice) -> SymbolicIndex:
+        return tuple(var(f"_{d}") + expr.begin[d] for d in range(expr.ndim))
+
+    # https://github.com/python/mypy/issues/8619
+    map_roll = partialmethod(handle_index_remapping, _indices_for_roll)  # type: ignore  # noqa
+    map_axis_permutation = (
+            partialmethod(handle_index_remapping, _indices_for_axis_permutation))  # type: ignore  # noqa
+    map_slice = partialmethod(handle_index_remapping, _indices_for_slice)  # type: ignore  # noqa
+
+    # }}}
+
+# }}}
+
+
 # {{{ generated array expressions
 
 # SymbolicIndex and ShapeType are semantically distinct but identical at the
@@ -95,6 +222,11 @@ class LoopyExpressionContext(object):
         A (read-only) local name mapping used for name lookup when generating
         code.
 
+    .. attribute:: num_indices
+
+        The number of indices of the form ``_0``, ``_1``, allowed in the
+        expression.
+
     .. attribute:: depends_on
 
         The set of statement IDs that need to be included in
@@ -109,6 +241,7 @@ class LoopyExpressionContext(object):
 
     """
     state: CodeGenState
+    num_indices: int
     _depends_on: FrozenSet[str] = \
             dataclasses.field(default_factory=frozenset)
     local_namespace: Mapping[str, Array] = \
@@ -134,18 +267,25 @@ class ImplementedResult(object):
     """Generated code for a node in the computation graph (i.e., an array
     expression).
 
-    .. attribute:: array
-
-        The :class:`pytato.Array` associated with this code.
-
     .. automethod:: to_loopy_expression
     """
-    def __init__(self, array: Array):
-        self.array = array
 
     def to_loopy_expression(self, indices: SymbolicIndex,
             expr_context: LoopyExpressionContext) -> ScalarExpression:
-        """Return a :mod:`loopy` expression for this result."""
+        """Return a :mod:`loopy` expression for this result.
+
+        :param indices: symbolic expressions for the indices of the array
+        :param expr_context: the associated expression context. The fields are
+            treated as follows:
+
+            - *depends_on* is populated with any dependencies needed for the
+              generated expression.
+
+            - *reduction_bounds* is populated with reduction bounds for the
+              reduction inames in the returned expression. If
+              *reduction_bounds* is nonempty, then the returned inames are
+              ensured to be disjoint from those present.
+        """
         raise NotImplementedError
 
 
@@ -154,13 +294,15 @@ class StoredResult(ImplementedResult):
 
     See also: :class:`pytato.array.ImplStored`.
     """
-    def __init__(self, name: str, array: Array):
-        super().__init__(array)
+    def __init__(self, name: str, num_indices: int, depends_on: FrozenSet[str]):
         self.name = name
+        self.num_indices = num_indices
+        self.depends_on = depends_on
 
-    # TODO: Handle dependencies.
     def to_loopy_expression(self, indices: SymbolicIndex,
             expr_context: LoopyExpressionContext) -> ScalarExpression:
+        assert len(indices) == self.num_indices
+        expr_context.update_depends_on(self.depends_on)
         if indices == ():
             return prim.Variable(self.name)
         else:
@@ -173,16 +315,41 @@ class InlinedResult(ImplementedResult):
 
     See also: :class:`pytato.array.ImplInlined`.
     """
-    def __init__(self, expr: ScalarExpression, array: Array):
-        super().__init__(array)
+    def __init__(self, expr: ScalarExpression,
+            num_indices: int,
+            reduction_bounds: ReductionBounds,
+            depends_on: FrozenSet[str]):
         self.expr = expr
+        self.num_indices = num_indices
+        self.reduction_bounds = dict(reduction_bounds)
+        self.depends_on = depends_on
+
+    @staticmethod
+    def from_loopy_expression(
+            loopy_expr: ScalarExpression,
+            loopy_expr_context: LoopyExpressionContext) -> InlinedResult:
+        return InlinedResult(loopy_expr,
+                loopy_expr_context.num_indices,
+                loopy_expr_context.reduction_bounds,
+                loopy_expr_context.depends_on)
 
-    # TODO: Handle dependencies and reduction domains.
     def to_loopy_expression(self, indices: SymbolicIndex,
             expr_context: LoopyExpressionContext) -> ScalarExpression:
-        return scalar_expr.substitute(
-                self.expr,
-                {f"_{d}": i for d, i in zip(range(self.array.ndim), indices)})
+        assert len(indices) == self.num_indices
+        substitutions = {f"_{d}": i for d, i in enumerate(indices)}
+
+        reduction_start = len(expr_context.reduction_bounds)
+
+        # Rename reductions in expression not to conflict with those in expr_context.
+        for i, (old_name, bounds) in enumerate(self.reduction_bounds.items()):
+            new_name = f"_r{i + reduction_start}"
+            assert new_name not in expr_context.reduction_bounds
+            substitutions[old_name] = var(new_name)
+            expr_context.reduction_bounds[new_name] = bounds
+
+        expr_context.update_depends_on(self.depends_on)
+
+        return scalar_expr.substitute(self.expr, substitutions)
 
 
 class SubstitutionRuleResult(ImplementedResult):
@@ -235,7 +402,7 @@ class CodeGenState:
         self._kernel = kernel
 
 
-class CodeGenMapper(pytato.transform.Mapper):
+class CodeGenMapper(Mapper):
     """A mapper for generating code for nodes in the computation graph.
     """
     exprgen_mapper: InlinedExpressionGenMapper
@@ -243,19 +410,92 @@ class CodeGenMapper(pytato.transform.Mapper):
     def __init__(self) -> None:
         self.exprgen_mapper = InlinedExpressionGenMapper(self)
 
+    def map_size_param(self, expr: SizeParam,
+            state: CodeGenState) -> ImplementedResult:
+        if expr in state.results:
+            return state.results[expr]
+
+        arg = lp.ValueArg(expr.name, dtype=expr.dtype)
+        kernel = state.kernel.copy(args=state.kernel.args + [arg])
+        state.update_kernel(kernel)
+
+        result = StoredResult(expr.name, expr.ndim, frozenset())
+        state.results[expr] = result
+        return result
+
     def map_placeholder(self, expr: Placeholder,
             state: CodeGenState) -> ImplementedResult:
         if expr in state.results:
             return state.results[expr]
 
+        shape_context = LoopyExpressionContext(state, num_indices=0)
+        shape = []
+        for component in expr.shape:
+            shape.append(self.exprgen_mapper(component, shape_context))
+            # Data-dependent shape: Not supported yet.
+            assert not shape_context.depends_on
+            assert not shape_context.reduction_bounds
+
         arg = lp.GlobalArg(expr.name,
-                shape=expr.shape,
+                shape=tuple(shape),
                 dtype=expr.dtype,
                 order="C")
         kernel = state.kernel.copy(args=state.kernel.args + [arg])
         state.update_kernel(kernel)
 
-        result = StoredResult(expr.name, expr)
+        result = StoredResult(expr.name, expr.ndim, frozenset())
+        state.results[expr] = result
+        return result
+
+    def map_matrix_product(self, expr: MatrixProduct,
+            state: CodeGenState) -> ImplementedResult:
+        if expr in state.results:
+            return state.results[expr]
+
+        x1_result = self.rec(expr.x1, state)
+        x2_result = self.rec(expr.x2, state)
+
+        loopy_expr_context = LoopyExpressionContext(state,
+                num_indices=expr.ndim)
+        loopy_expr_context.reduction_bounds["_r0"] = (0, expr.x2.shape[0])
+
+        # Figure out inames.
+        x1_inames = []
+        for i in range(expr.x1.ndim):
+            if i == expr.x1.ndim - 1:
+                x1_inames.append(var("_r0"))
+            else:
+                x1_inames.append(var(f"_{i}"))
+        x2_inames = []
+        for i in range(expr.x2.ndim):
+            if i == 0:
+                x2_inames.append(var("_r0"))
+            else:
+                offset = i + len(x1_inames) - 2
+                x2_inames.append(var(f"_{offset}"))
+
+        inner_expr = x1_result.to_loopy_expression(
+                tuple(x1_inames), loopy_expr_context)
+        inner_expr *= x2_result.to_loopy_expression(
+                tuple(x2_inames), loopy_expr_context)
+
+        import loopy.library.reduction as red
+        loopy_expr = lp.Reduction(
+                operation=red.parse_reduction_op("sum"),
+                inames=("_r0",),
+                expr=inner_expr,
+                allow_simultaneous=False)
+
+        inlined_result = InlinedResult.from_loopy_expression(loopy_expr,
+                loopy_expr_context)
+
+        output_name = state.var_name_gen("matmul")
+
+        insn_id = add_store(output_name, expr, inlined_result, state,
+                output_to_temporary=True)
+
+        result = StoredResult(output_name, expr.ndim, frozenset([insn_id]))
+
         state.results[expr] = result
         return result
 
@@ -266,11 +506,13 @@ class CodeGenMapper(pytato.transform.Mapper):
 
         # TODO: Respect tags.
 
-        expr_context = LoopyExpressionContext(state,
-                local_namespace=expr.bindings)
-        loopy_expr = self.exprgen_mapper(expr.expr, expr_context)
+        loopy_expr_context = LoopyExpressionContext(state,
+                local_namespace=expr.bindings,
+                num_indices=expr.ndim)
+        loopy_expr = self.exprgen_mapper(expr.expr, loopy_expr_context)
 
-        result = InlinedResult(loopy_expr, expr)
+        result = InlinedResult.from_loopy_expression(loopy_expr,
+                loopy_expr_context)
         state.results[expr] = result
         return result
 
@@ -279,6 +521,9 @@ class CodeGenMapper(pytato.transform.Mapper):
 
 # {{{ inlined expression gen mapper
 
+INDEX_RE = re.compile("_(0|([1-9][0-9]*))")
+
+
 class InlinedExpressionGenMapper(scalar_expr.IdentityMapper):
     """A mapper for generating :mod:`loopy` expressions with inlined
     sub-expressions.
@@ -310,20 +555,31 @@ class InlinedExpressionGenMapper(scalar_expr.IdentityMapper):
 
     def map_variable(self, expr: prim.Variable,
             expr_context: LoopyExpressionContext) -> ScalarExpression:
-        result: ImplementedResult = self.codegen_mapper(
-                expr_context.lookup(expr.name),
-                expr_context.state)
-        return result.to_loopy_expression((), expr_context)
+        match = INDEX_RE.fullmatch(expr.name)
+        if match:
+            # Found an index of the form _0, _1, ...
+            index = int(match.group(1))
+            if not (0 <= index < expr_context.num_indices):
+                raise ValueError(f"invalid index encountered: _{index}")
+            return expr
+        else:
+            array = expr_context.lookup(expr.name)
+            impl_result: ImplementedResult = self.codegen_mapper(array,
+                    expr_context.state)
+            return impl_result.to_loopy_expression((), expr_context)
 
 # }}}
 
 
 # {{{ utils
 
-def domain_for_shape(
-        dim_names: Tuple[str, ...], shape: ShapeType) -> isl.BasicSet:
+def domain_for_shape(dim_names: Tuple[str, ...],
+         shape: ShapeType,
+         reductions: Dict[str, Tuple[ScalarExpression, ScalarExpression]],
+         ) -> isl.BasicSet:  # noqa
     """Create an :class:`islpy.BasicSet` that expresses an appropriate index domain
-    for an array of (potentially symbolic) shape *shape*.
+    for an array of (potentially symbolic) shape *shape* having reduction
+    dimensions *reductions*.
 
     :param dim_names: A tuple of strings, the names of the axes. These become set
         dimensions in the returned domain.
@@ -332,6 +588,10 @@ def domain_for_shape(
         expressions. The variables in these expressions become parameter
         dimensions in the returned set.  Must have the same length as
         *dim_names*.
+
+    :arg reductions: A map from reduction inames to (lower, upper) bounds
+        (as half-open integer ranges). The variables in the bounds become
+        parameter dimensions in the returned set.
     """
     assert len(dim_names) == len(shape)
 
@@ -340,7 +600,12 @@ def domain_for_shape(
     for sdep in map(scalar_expr.get_dependencies, shape):
         param_names_set |= sdep
 
-    set_names = sorted(dim_names)
+    for bounds in reductions.values():
+        for sdep in map(scalar_expr.get_dependencies, bounds):
+            # FIXME: Assumes that reduction bounds are not data-dependent.
+            param_names_set |= sdep
+
+    set_names = sorted(tuple(dim_names) + tuple(reductions))
     param_names = sorted(param_names_set)
 
     # Build domain.
@@ -357,51 +622,144 @@ def domain_for_shape(
         dom &= affs[0].le_set(affs[iname])
         dom &= affs[iname].lt_set(aff_from_expr(dom.space, dim))
 
+    for iname, (left, right) in reductions.items():
+        dom &= aff_from_expr(dom.space, left).le_set(affs[iname])
+        dom &= affs[iname].lt_set(aff_from_expr(dom.space, right))
+
     dom, = dom.get_basic_sets()
 
     return dom
 
 
-def add_output(name: str, expr: Array, state: CodeGenState,
-        mapper: CodeGenMapper) -> None:
-    """Add an output argument to the kernel.
-    """
-    # FIXE: Scalar outputs are not supported yet.
-    assert expr.shape != ()
+def add_store(name: str, expr: Array, result: ImplementedResult,
+       state: CodeGenState, output_to_temporary: bool = False) -> str:
+    """Add an instruction that stores to a variable in the kernel.
 
-    result = mapper(expr, state)
+    :param name: name of the output array, which is created
+    :param expr: the :class:`~pytato.Array` to store
+    :param result: the corresponding :class:`ImplementedResult`
+    :param state: code generation state
+    :param output_to_temporary: whether to generate an output argument (default)
+        or a temporary variable
 
+    :returns: the id of the generated instruction
+    """
+    # Get expression.
     inames = tuple(
             state.var_name_gen(f"{name}_dim{d}")
             for d in range(expr.ndim))
-    domain = domain_for_shape(inames, expr.shape)
-
-    arg = lp.GlobalArg(name,
-            shape=expr.shape,
-            dtype=expr.dtype,
-            order="C",
-            is_output_only=True)
-
     indices = tuple(prim.Variable(iname) for iname in inames)
-    expr_context = LoopyExpressionContext(state)
-    copy_expr = result.to_loopy_expression(indices, expr_context)
+    loopy_expr_context = LoopyExpressionContext(state, num_indices=0)
+    loopy_expr = result.to_loopy_expression(indices, loopy_expr_context)
 
-    # TODO: Contextual data not supported yet.
-    assert not expr_context.reduction_bounds
-    assert not expr_context.depends_on
+    # Rename reduction variables to names suitable as inames.
+    loopy_expr = rename_reductions(
+            loopy_expr, loopy_expr_context,
+            lambda old_name: state.var_name_gen(f"{name}{old_name}"))
 
+    # Make the instruction
     from loopy.kernel.instruction import make_assignment
-    insn = make_assignment((prim.Variable(name)[indices],),
-            copy_expr,
-            id=state.insn_id_gen(f"{name}_copy"),
+    if indices:
+        assignee = prim.Variable(name)[indices]
+    else:
+        assignee = prim.Variable(name)
+    insn_id = state.insn_id_gen(f"{name}_store")
+    insn = make_assignment((assignee,),
+            loopy_expr,
+            id=insn_id,
             within_inames=frozenset(inames),
-            depends_on=expr_context.depends_on)
+            depends_on=loopy_expr_context.depends_on)
 
+    # Get the domain.
+    domain = domain_for_shape(inames, expr.shape,
+            loopy_expr_context.reduction_bounds)
+
+    # Update the kernel.
     kernel = state.kernel
-    kernel = kernel.copy(args=kernel.args + [arg],
-            instructions=kernel.instructions + [insn],
-            domains=kernel.domains + [domain])
+
+    if output_to_temporary:
+        tvar = get_loopy_temporary(name, expr)
+        temporary_variables = kernel.temporary_variables.copy()
+        temporary_variables[name] = tvar
+        kernel = kernel.copy(temporary_variables=temporary_variables,
+                domains=kernel.domains + [domain],
+                instructions=kernel.instructions + [insn])
+    else:
+        arg = lp.GlobalArg(name,
+                shape=expr.shape,
+                dtype=expr.dtype,
+                order="C",
+                is_output_only=True)
+        kernel = kernel.copy(args=kernel.args + [arg],
+                domains=kernel.domains + [domain],
+                instructions=kernel.instructions + [insn])
+
     state.update_kernel(kernel)
+    return insn_id
+
+
+def get_loopy_temporary(name: str, expr: Array) -> lp.TemporaryVariable:
+    is_shape_symbolic = not all(isinstance(dim, int) for dim in expr.shape)
+    # Only global variables can have symbolic shape.
+    address_space = lp.AddressSpace.GLOBAL if is_shape_symbolic else lp.auto
+    return lp.TemporaryVariable(name,
+            dtype=expr.dtype,
+            shape=expr.shape,
+            address_space=address_space)
+
+
+def rename_reductions(
+        loopy_expr: ScalarExpression,
+        loopy_expr_context: LoopyExpressionContext,
+        var_name_gen: Callable[[str], str]) -> ScalarExpression:
+    """Rename the reduction variables in *loopy_expr* and *loopy_expr_context*
+    using the callable *var_name_gen.*
+    """
+    new_reduction_inames = tuple(
+            var_name_gen(old_iname)
+            for old_iname in loopy_expr_context.reduction_bounds)
+
+    substitutions = dict(zip(
+            loopy_expr_context.reduction_bounds,
+            map(var, new_reduction_inames)))
+
+    result = scalar_expr.substitute(loopy_expr, substitutions)
+
+    new_reduction_bounds = {
+            substitutions[old_iname].name: bounds
+            for old_iname, bounds in loopy_expr_context.reduction_bounds.items()}
+
+    loopy_expr_context.reduction_bounds = new_reduction_bounds
+    return result
+
+
+def get_initial_codegen_state(namespace: Namespace, target: Target,
+        options: Optional[lp.Options]) -> CodeGenState:
+    kernel = lp.make_kernel("{:}", [],
+            target=target.get_loopy_target(),
+            options=options,
+            lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)
+
+    return CodeGenState(namespace=namespace,
+            _kernel=kernel,
+            results=dict())
+
+
+@dataclasses.dataclass(init=True, repr=False, eq=False)
+class PreprocessResult:
+    outputs: DictOfNamedArrays
+    bound_arguments: Dict[str, DataInterface]
+
+
+def preprocess(outputs: DictOfNamedArrays) -> PreprocessResult:
+    """Preprocess a computation for code generation."""
+    mapper = CodeGenPreprocessor(Namespace())
+
+    from pytato.transform import copy_dict_of_named_arrays
+    new_outputs = copy_dict_of_named_arrays(outputs, mapper)
+
+    return PreprocessResult(outputs=new_outputs,
+            bound_arguments=mapper.bound_arguments)
 
 # }}}
 
@@ -418,16 +776,14 @@ def generate_loopy(result: Union[Array, DictOfNamedArrays],
     """
     # {{{ get namespace and outputs
 
-    outputs: DictOfNamedArrays
+    orig_outputs: DictOfNamedArrays
 
     if isinstance(result, Array):
-        outputs = DictOfNamedArrays({"_pt_out": result})
-        namespace = outputs.namespace
+        orig_outputs = DictOfNamedArrays({"_pt_out": result})
     else:
         assert isinstance(result, DictOfNamedArrays)
-        outputs = result
+        orig_outputs = result
 
-    namespace = outputs.namespace
     del result
 
     # }}}
@@ -435,29 +791,27 @@ def generate_loopy(result: Union[Array, DictOfNamedArrays],
     if target is None:
         target = PyOpenCLTarget()
 
-    # Set up codegen state.
-    kernel = lp.make_kernel("{:}", [],
-            target=target.get_loopy_target(),
-            options=options,
-            lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)
+    preproc_result = preprocess(orig_outputs)
+    outputs = preproc_result.outputs
+    namespace = outputs.namespace
 
-    state = CodeGenState(namespace=namespace,
-            _kernel=kernel,
-            results=dict())
+    state = get_initial_codegen_state(namespace, target, options)
 
     # Reserve names of input and output arguments.
     for val in namespace.values():
-        if isinstance(val, Placeholder):
+        if isinstance(val, InputArgumentBase):
             state.var_name_gen.add_name(val.name)
     state.var_name_gen.add_names(outputs)
 
     # Generate code for graph nodes.
-    mapper = CodeGenMapper()
+    cg_mapper = CodeGenMapper()
     for name, val in namespace.items():
-        _ = mapper(val, state)
+        _ = cg_mapper(val, state)
 
     # Generate code for outputs.
     for name, expr in outputs.items():
-        add_output(name, expr, state, mapper)
+        add_store(name, expr, cg_mapper(expr, state), state)
 
-    return target.bind_program(program=state.kernel, bound_arguments=dict())
+    return target.bind_program(
+            program=state.kernel,
+            bound_arguments=preproc_result.bound_arguments)
diff --git a/pytato/scalar_expr.py b/pytato/scalar_expr.py
index a09e9db9eddb5563c36ff1348982301039bb0f89..0cbc33935004f0a07f34bb4ca51a249b5f8d739b 100644
--- a/pytato/scalar_expr.py
+++ b/pytato/scalar_expr.py
@@ -25,8 +25,9 @@ THE SOFTWARE.
 """
 
 from numbers import Number
-from typing import Any, Union, Mapping, FrozenSet
+from typing import Any, Union, Mapping, FrozenSet, Set
 
+from loopy.symbolic import Reduction
 from pymbolic.mapper import (WalkMapper as WalkMapperBase, IdentityMapper as
         IdentityMapperBase)
 from pymbolic.mapper.substitutor import (SubstitutionMapper as
@@ -70,19 +71,60 @@ def parse(s: str) -> ScalarExpression:
 # {{{ mapper classes
 
 class WalkMapper(WalkMapperBase):
-    pass
+
+    def map_reduction(self, expr: Reduction, *args: Any, **kwargs: Any) -> None:
+        if not self.visit(expr, *args, **kwargs):
+            return
+
+        self.rec(expr.expr, *args, **kwargs)
 
 
 class IdentityMapper(IdentityMapperBase):
-    pass
+
+    def map_reduction(self, expr: Reduction,
+            *args: Any, **kwargs: Any) -> Reduction:
+        new_inames = []
+        for iname in expr.inames:
+            new_iname = self.rec(prim.Variable(iname), *args, **kwargs)
+            if not isinstance(new_iname, prim.Variable):
+                raise ValueError(
+                        f"reduction iname {iname} can only be renamed"
+                        " to another iname")
+            new_inames.append(new_iname.name)
+
+        return Reduction(expr.operation,
+                tuple(new_inames),
+                self.rec(expr.expr, *args, **kwargs),
+                allow_simultaneous=expr.allow_simultaneous)
 
 
 class SubstitutionMapper(SubstitutionMapperBase):
-    pass
+
+    def map_reduction(self, expr: Reduction) -> Reduction:
+        new_inames = []
+        for iname in expr.inames:
+            new_iname = self.subst_func(iname)
+            if new_iname is None:
+                new_iname = prim.Variable(iname)
+            else:
+                if not isinstance(new_iname, prim.Variable):
+                    raise ValueError(
+                            f"reduction iname {iname} can only be renamed"
+                            " to another iname")
+            new_inames.append(new_iname.name)
+
+        return Reduction(expr.operation,
+                tuple(new_inames),
+                self.rec(expr.expr),
+                allow_simultaneous=expr.allow_simultaneous)
 
 
 class DependencyMapper(DependencyMapperBase):
-    pass
+
+    def map_reduction(self, expr: Reduction,
+            *args: Any, **kwargs: Any) -> Set[prim.Variable]:
+        deps: Set[prim.Variable] = self.rec(expr.expr, *args, **kwargs)
+        return deps - set(prim.Variable(iname) for iname in expr.inames)
 
 # }}}
 
diff --git a/pytato/transform.py b/pytato/transform.py
index 13546653188a6da64a5f65b1ef3ed68832e115b3..19739745550bd29cac72580af84773d07f610e3f 100644
--- a/pytato/transform.py
+++ b/pytato/transform.py
@@ -24,9 +24,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from typing import Any, Callable
+from typing import Any, Callable, Dict
 
-from pytato.array import Array
+from pytato.array import (
+        Array, IndexLambda, Namespace, Placeholder, MatrixProduct, Stack,
+        Roll, AxisPermutation, Slice, DataWrapper, SizeParam, DictOfNamedArrays)
 
 __doc__ = """
 .. currentmodule:: pytato.transform
@@ -34,7 +36,9 @@ __doc__ = """
 Transforming Computations
 -------------------------
 
-.. autoclass:: Mapper
+.. autoclass:: CopyMapper
+.. autofunction:: copy_namespace
+.. autofunction:: copy_dict_of_named_arrays
 
 """
 
@@ -59,11 +63,11 @@ class Mapper:
         raise ValueError("%s encountered invalid foreign object: %s"
                 % (type(self).__name__, repr(expr)))
 
-    def __call__(self, expr: Array, *args: Any, **kwargs: Any) -> Any:
+    def rec(self, expr: Array, *args: Any, **kwargs: Any) -> Any:
         method: Callable[..., Array]
 
         try:
-            method = getattr(self, expr.mapper_method)
+            method = getattr(self, expr._mapper_method)
         except AttributeError:
             if isinstance(expr, Array):
                 return self.handle_unsupported_array(expr, *args, **kwargs)
@@ -72,9 +76,116 @@ class Mapper:
 
         return method(expr, *args, **kwargs)
 
-    rec = __call__
+    def __call__(self, expr: Array, *args: Any, **kwargs: Any) -> Any:
+        return self.rec(expr, *args, **kwargs)
+
+
+class CopyMapper(Mapper):
+
+    def __init__(self, namespace: Namespace):
+        self.namespace = namespace
+        self.cache: Dict[Array, Array] = {}
+
+    def rec(self, expr: Array) -> Array:  # type: ignore
+        if expr in self.cache:
+            return self.cache[expr]
+        result: Array = super().rec(expr)
+        self.cache[expr] = result
+        return result
+
+    def map_index_lambda(self, expr: IndexLambda) -> Array:
+        bindings = {
+                name: self.rec(subexpr)
+                for name, subexpr in expr.bindings.items()}
+        return IndexLambda(namespace=self.namespace,
+                expr=expr.expr,
+                shape=expr.shape,
+                dtype=expr.dtype,
+                bindings=bindings,
+                tags=expr.tags)
+
+    def map_placeholder(self, expr: Placeholder) -> Array:
+        return Placeholder(namespace=self.namespace,
+                name=expr.name,
+                shape=expr.shape,
+                dtype=expr.dtype,
+                tags=expr.tags)
+
+    def map_matrix_product(self, expr: MatrixProduct) -> Array:
+        return MatrixProduct(x1=self.rec(expr.x1),
+                x2=self.rec(expr.x2),
+                tags=expr.tags)
+
+    def map_stack(self, expr: Stack) -> Array:
+        arrays = tuple(self.rec(arr) for arr in expr.arrays)
+        return Stack(arrays=arrays, axis=expr.axis, tags=expr.tags)
+
+    def map_roll(self, expr: Roll) -> Array:
+        return Roll(array=self.rec(expr.array),
+                shift=expr.shift,
+                axis=expr.axis,
+                tags=expr.tags)
+
+    def map_axis_permutation(self, expr: AxisPermutation) -> Array:
+        return AxisPermutation(array=self.rec(expr.array),
+                axes=expr.axes,
+                tags=expr.tags)
+
+    def map_slice(self, expr: Slice) -> Array:
+        return Slice(array=self.rec(expr.array),
+                begin=expr.begin,
+                size=expr.size,
+                tags=expr.tags)
+
+    def map_data_wrapper(self, expr: DataWrapper) -> Array:
+        return DataWrapper(namespace=self.namespace,
+                name=expr.name,
+                data=expr.data,
+                shape=expr.shape,
+                tags=expr.tags)
+
+    def map_size_param(self, expr: SizeParam) -> Array:
+        return SizeParam(self.namespace, name=expr.name, tags=expr.tags)
 
 # }}}
 
 
+# {{{ mapper frontends
+
+def copy_namespace(source_namespace: Namespace,
+        copy_mapper: CopyMapper) -> Namespace:
+    """Copy the elements of *namespace* into a new namespace.
+
+    :param source_namespace: The namespace to copy
+    :param copy_mapper: A mapper that performs copies into a new namespace
+    :returns: A new namespace containing copies of the items in *source_namespace*
+    """
+    for name, val in source_namespace.items():
+        mapped_val = copy_mapper(val)
+        if name not in copy_mapper.namespace:
+            copy_mapper.namespace.assign(name, mapped_val)
+    return copy_mapper.namespace
+
+
+def copy_dict_of_named_arrays(source_dict: DictOfNamedArrays,
+        copy_mapper: CopyMapper) -> DictOfNamedArrays:
+    """Copy the elements of a :class:`~pytato.DictOfNamedArrays` into a
+    :class:`~pytato.DictOfNamedArrays` with a new namespace.
+
+    :param source_dict: The :class:`~pytato.DictOfNamedArrays` to copy
+    :param copy_mapper: A mapper that performs copies into a new namespace
+    :returns: A new :class:`~pytato.DictOfNamedArrays` containing copies of the
+        items in *source_dict*
+    """
+    if not source_dict:
+        return DictOfNamedArrays({})
+
+    data = {}
+    copy_namespace(source_dict.namespace, copy_mapper)
+    for name, val in source_dict.items():
+        data[name] = copy_mapper(val)
+    return DictOfNamedArrays(data)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/setup.py b/setup.py
index 6bc7b74adc3c4986749f248b239b5859ac1e3bb5..094aa6e632b101742659c7b3619a440a8bf77ef3 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ setup(name="pytato",
           'Topic :: Software Development :: Libraries',
           ],
 
-      python_requires="~=3.6",
+      python_requires="~=3.8",
       install_requires=[
           "loo.py",
           ],
diff --git a/test/test_codegen.py b/test/test_codegen.py
index 0f26bc0c1881a2b5e2b45844004fa9ed545bfafd..4d901b27d1e34864e4765d47bbec38c2279f266c 100755
--- a/test/test_codegen.py
+++ b/test/test_codegen.py
@@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import itertools
+import operator
 import sys
 
 import loopy as lp
@@ -35,6 +37,7 @@ from pyopencl.tools import (  # noqa
 import pytest  # noqa
 
 import pytato as pt
+from pytato.array import Placeholder
 
 
 def test_basic_codegen(ctx_factory):
@@ -42,20 +45,89 @@ def test_basic_codegen(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     namespace = pt.Namespace()
-    x = pt.Placeholder(namespace, "x", (5,), np.int)
+    x = Placeholder(namespace, "x", (5,), np.int)
     prog = pt.generate_loopy(x * x, target=pt.PyOpenCLTarget(queue))
     x_in = np.array([1, 2, 3, 4, 5])
     _, (out,) = prog(x=x_in)
     assert (out == x_in * x_in).all()
 
 
+def test_scalar_placeholder(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    namespace = pt.Namespace()
+    x = Placeholder(namespace, "x", (), np.int)
+    prog = pt.generate_loopy(x, target=pt.PyOpenCLTarget(queue))
+    x_in = np.array(1)
+    _, (x_out,) = prog(x=x_in)
+    assert np.array_equal(x_out, x_in)
+
+
+def test_size_param(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    namespace = pt.Namespace()
+    n = pt.make_size_param(namespace, "n")
+    pt.make_placeholder(namespace, "x", "(n,)", np.int)
+    prog = pt.generate_loopy(n, target=pt.PyOpenCLTarget(queue))
+    x_in = np.array([1, 2, 3, 4, 5])
+    _, (n_out,) = prog(x=x_in)
+    assert n_out == 5
+
+
+@pytest.mark.parametrize("x1_ndim", (1, 2))
+@pytest.mark.parametrize("x2_ndim", (1, 2))
+def test_matmul(ctx_factory, x1_ndim, x2_ndim):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    def get_array(ndim):
+        arr = np.array([[1, 2], [3, 4]])
+        return arr[(0,) * (arr.ndim - ndim)]
+
+    x1_in = get_array(x1_ndim)
+    x2_in = get_array(x2_ndim)
+
+    namespace = pt.Namespace()
+    x1 = pt.make_data_wrapper(namespace, x1_in)
+    x2 = pt.make_data_wrapper(namespace, x2_in)
+    prog = pt.generate_loopy(x1 @ x2, target=pt.PyOpenCLTarget(queue))
+    _, (out,) = prog()
+
+    assert (out == x1_in @ x2_in).all()
+
+
+def test_data_wrapper(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    # Without name/shape
+    namespace = pt.Namespace()
+    x_in = np.array([1, 2, 3, 4, 5])
+    x = pt.make_data_wrapper(namespace, x_in)
+    prog = pt.generate_loopy(x, target=pt.PyOpenCLTarget(queue))
+    _, (x_out,) = prog()
+    assert (x_out == x_in).all()
+
+    # With name/shape
+    namespace = pt.Namespace()
+    x_in = np.array([[1, 2], [3, 4], [5, 6]])
+    pt.make_size_param(namespace, "n")
+    x = pt.make_data_wrapper(namespace, x_in, name="x", shape="(n, 2)")
+    prog = pt.generate_loopy(x, target=pt.PyOpenCLTarget(queue))
+    _, (x_out,) = prog()
+    assert (x_out == x_in).all()
+
+
 def test_codegen_with_DictOfNamedArrays(ctx_factory):  # noqa
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     namespace = pt.Namespace()
-    x = pt.Placeholder(namespace, "x", (5,), np.int)
-    y = pt.Placeholder(namespace, "y", (5,), np.int)
+    x = Placeholder(namespace, "x", (5,), np.int)
+    y = Placeholder(namespace, "y", (5,), np.int)
     x_in = np.array([1, 2, 3, 4, 5])
     y_in = np.array([6, 7, 8, 9, 10])
 
@@ -77,6 +149,262 @@ def test_codegen_with_DictOfNamedArrays(ctx_factory):  # noqa
     assert (outputs["y_out"] == y_in).all()
 
 
+@pytest.mark.parametrize("shift", (-1, 1, 0, -20, 20))
+@pytest.mark.parametrize("axis", (0, 1))
+def test_roll(ctx_factory, shift, axis):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    namespace = pt.Namespace()
+    pt.make_size_param(namespace, "n")
+    x = pt.make_placeholder(namespace, "x", shape=("n", "n"), dtype=np.float)
+
+    prog = pt.generate_loopy(
+            pt.roll(x, shift=shift, axis=axis),
+            target=pt.PyOpenCLTarget(queue))
+
+    x_in = np.arange(1., 10.).reshape(3, 3)
+
+    _, (x_out,) = prog(x=x_in)
+
+    assert (x_out == np.roll(x_in, shift=shift, axis=axis)).all()
+
+
+@pytest.mark.parametrize("axes", (
+    (), (0, 1), (1, 0),
+    (0, 1, 2), (0, 2, 1), (1, 0, 2), (1, 2, 0), (2, 0, 1), (2, 1, 0)))
+def test_axis_permutation(ctx_factory, axes):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    ndim = len(axes)
+    shape = (3, 4, 5)[:ndim]
+
+    from numpy.random import default_rng
+    rng = default_rng()
+
+    x_in = rng.random(size=shape)
+
+    namespace = pt.Namespace()
+    x = pt.make_data_wrapper(namespace, x_in)
+    prog = pt.generate_loopy(
+            pt.transpose(x, axes),
+            target=pt.PyOpenCLTarget(queue))
+
+    _, (x_out,) = prog()
+    assert (x_out == np.transpose(x_in, axes)).all()
+
+
+def test_transpose(ctx_factory):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    shape = (2, 8)
+
+    from numpy.random import default_rng
+    rng = default_rng()
+    x_in = rng.random(size=shape)
+
+    namespace = pt.Namespace()
+    x = pt.make_data_wrapper(namespace, x_in)
+    prog = pt.generate_loopy(x.T, target=pt.PyOpenCLTarget(queue))
+
+    _, (x_out,) = prog()
+    assert (x_out == x_in.T).all()
+
+
+# Doesn't include: ? (boolean), g (float128), G (complex256)
+ARITH_DTYPES = "bhilqpBHILQPfdFD"
+
+
+def reverse_args(f):
+    def wrapper(*args):
+        return f(*reversed(args))
+    return wrapper
+
+
+@pytest.mark.parametrize("which", ("add", "sub", "mul", "truediv"))
+@pytest.mark.parametrize("reverse", (False, True))
+def test_scalar_array_binary_arith(ctx_factory, which, reverse):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    op = getattr(operator, which)
+    if reverse:
+        op = reverse_args(op)
+
+    x_orig = 7
+    y_orig = np.array([1, 2, 3, 4, 5])
+
+    for first_dtype in (int, float, complex):
+        namespace = pt.Namespace()
+        x_in = first_dtype(x_orig)
+
+        exprs = {}
+        for dtype in ARITH_DTYPES:
+            y = pt.make_data_wrapper(namespace,
+                    y_orig.astype(dtype), name=f"y{dtype}")
+            exprs[dtype] = op(x_in, y)
+
+        prog = pt.generate_loopy(pt.make_dict_of_named_arrays(exprs),
+                target=pt.PyOpenCLTarget(queue),
+                options=lp.Options(return_dict=True))
+
+        _, outputs = prog()
+
+        for dtype in exprs:
+            out = outputs[dtype]
+            out_ref = op(x_in, y_orig.astype(dtype))
+
+            assert out.dtype == out_ref.dtype, (out.dtype, out_ref.dtype)
+            # In some cases ops are done in float32 in loopy but float64 in numpy.
+            assert np.allclose(out, out_ref), (out, out_ref)
+
+
+@pytest.mark.parametrize("which", ("add", "sub", "mul", "truediv"))
+@pytest.mark.parametrize("reverse", (False, True))
+def test_array_array_binary_arith(ctx_factory, which, reverse):
+    if which == "sub":
+        pytest.skip("https://github.com/inducer/loopy/issues/131")
+
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    op = getattr(operator, which)
+    if reverse:
+        op = reverse_args(op)
+
+    x_orig = np.array([1, 2, 3, 4, 5])
+    y_orig = np.array([10, 9, 8, 7, 6])
+
+    for first_dtype in ARITH_DTYPES:
+        namespace = pt.Namespace()
+        x_in = x_orig.astype(first_dtype)
+        x = pt.make_data_wrapper(namespace, x_in, name="x")
+
+        exprs = {}
+        for dtype in ARITH_DTYPES:
+            y = pt.make_data_wrapper(namespace,
+                    y_orig.astype(dtype), name=f"y{dtype}")
+            exprs[dtype] = op(x, y)
+
+        prog = pt.generate_loopy(pt.make_dict_of_named_arrays(exprs),
+                target=pt.PyOpenCLTarget(queue),
+                options=lp.Options(return_dict=True))
+
+        _, outputs = prog()
+
+        for dtype in ARITH_DTYPES:
+            out = outputs[dtype]
+            out_ref = op(x_in, y_orig.astype(dtype))
+
+            assert out.dtype == out_ref.dtype, (out.dtype, out_ref.dtype)
+            # In some cases ops are done in float32 in loopy but float64 in numpy.
+            assert np.allclose(out, out_ref), (out, out_ref)
+
+
+@pytest.mark.parametrize("which", ("neg", "pos"))
+def test_unary_arith(ctx_factory, which):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    op = getattr(operator, which)
+
+    x_orig = np.array([1, 2, 3, 4, 5])
+    namespace = pt.Namespace()
+
+    exprs = {}
+    for dtype in ARITH_DTYPES:
+        exprs[dtype] = op(
+                pt.make_data_wrapper(namespace, x_orig.astype(dtype)))
+
+    prog = pt.generate_loopy(pt.make_dict_of_named_arrays(exprs),
+            target=pt.PyOpenCLTarget(queue),
+            options=lp.Options(return_dict=True))
+
+    _, outputs = prog()
+
+    for dtype in ARITH_DTYPES:
+        out = outputs[dtype]
+        out_ref = op(x_orig.astype(dtype))
+
+        assert out.dtype == out_ref.dtype
+        assert np.array_equal(out, out_ref)
+
+
+def generate_test_slices_for_dim(dim_bound):
+    # Include scalars to test indexing.
+    for i in range(dim_bound):
+        yield i
+
+    for i in range(0, dim_bound):
+        for j in range(i + 1, 1 + dim_bound):
+            yield slice(i, j, None)
+
+
+def generate_test_slices(shape):
+    yield from itertools.product(*map(generate_test_slices_for_dim, shape))
+
+
+@pytest.mark.parametrize("shape", [(3,), (2, 2), (1, 2, 1)])
+def test_slice(ctx_factory, shape):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    from numpy.random import default_rng
+    rng = default_rng()
+
+    x_in = rng.random(size=shape)
+    namespace = pt.Namespace()
+    x = pt.make_data_wrapper(namespace, x_in)
+
+    outputs = {}
+    ref_outputs = {}
+
+    i = 0
+    for slice_ in generate_test_slices(shape):
+        outputs[f"out_{i}"] = x[slice_]
+        ref_outputs[f"out_{i}"] = x_in[slice_]
+        i += 1
+
+    prog = pt.generate_loopy(
+            pt.make_dict_of_named_arrays(outputs),
+            target=pt.PyOpenCLTarget(queue),
+            options=lp.Options(return_dict=True))
+
+    _, outputs = prog()
+
+    for output in outputs:
+        x_out = outputs[output]
+        x_ref = ref_outputs[output]
+        assert (x_out == x_ref).all()
+
+
+@pytest.mark.parametrize("input_dims", (1, 2, 3))
+def test_stack(ctx_factory, input_dims):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    shape = (2, 2, 2)[:input_dims]
+
+    from numpy.random import default_rng
+    rng = default_rng()
+    x_in = rng.random(size=shape)
+    y_in = rng.random(size=shape)
+
+    namespace = pt.Namespace()
+    x = pt.make_data_wrapper(namespace, x_in)
+    y = pt.make_data_wrapper(namespace, y_in)
+
+    for axis in range(0, 1 + input_dims):
+        prog = pt.generate_loopy(
+                pt.stack((x, y), axis=axis),
+                target=pt.PyOpenCLTarget(queue))
+
+        _, (out,) = prog()
+        assert (out == np.stack((x_in, y_in), axis=axis)).all()
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_pytato.py b/test/test_pytato.py
index 8b8e68d9be70cacdb52363e5f4e02b146b977f9b..49444fa599e3a8940c195020d20203f24e1d3598 100755
--- a/test/test_pytato.py
+++ b/test/test_pytato.py
@@ -23,17 +23,98 @@ THE SOFTWARE.
 """
 
 import sys
-import pytest  # noqa
 
+import numpy as np
+import pytest
 
-def test_being_very_lazy():
-    # Placeholder to make CI pass. Remove after tests are added.
-    pass
+import pytato as pt
+
+
+def test_matmul_input_validation():
+    namespace = pt.Namespace()
+
+    a = pt.make_placeholder(namespace, "a", (10, 10), np.float)
+    b = pt.make_placeholder(namespace, "b", (20, 10), np.float)
+
+    with pytest.raises(ValueError):
+        a @ b
+
+    c = pt.make_placeholder(namespace, "c", (), np.float)
+    with pytest.raises(ValueError):
+        c @ c
+
+    pt.make_size_param(namespace, "n")
+    d = pt.make_placeholder(namespace, "d", "(n, n)", np.float)
+    d @ d
+
+
+def test_roll_input_validation():
+    namespace = pt.Namespace()
+
+    a = pt.make_placeholder(namespace, "a", (10, 10), np.float)
+    pt.roll(a, 1, axis=0)
+
+    with pytest.raises(ValueError):
+        pt.roll(a, 1, axis=2)
+
+    with pytest.raises(ValueError):
+        pt.roll(a, 1, axis=-1)
+
+
+def test_transpose_input_validation():
+    namespace = pt.Namespace()
+
+    a = pt.make_placeholder(namespace, "a", (10, 10), np.float)
+    pt.transpose(a)
+
+    with pytest.raises(ValueError):
+        pt.transpose(a, (2, 0, 1))
+
+    with pytest.raises(ValueError):
+        pt.transpose(a, (1, 1))
+
+    with pytest.raises(ValueError):
+        pt.transpose(a, (0,))
+
+
+def test_slice_input_validation():
+    namespace = pt.Namespace()
+
+    a = pt.make_placeholder(namespace, "a", (10, 10, 10), np.float)
+
+    a[0]
+    a[0, 0]
+    a[0, 0, 0]
+
+    with pytest.raises(ValueError):
+        a[0, 0, 0, 0]
+
+    with pytest.raises(ValueError):
+        a[10]
+
+
+def test_stack_input_validation():
+    namespace = pt.Namespace()
+
+    x = pt.make_placeholder(namespace, "x", (10, 10), np.float)
+    y = pt.make_placeholder(namespace, "y", (1, 10), np.float)
+
+    assert pt.stack((x, x, x), axis=0).shape == (3, 10, 10)
+
+    pt.stack((x,), axis=0)
+    pt.stack((x,), axis=1)
+
+    with pytest.raises(ValueError):
+        pt.stack(())
+
+    with pytest.raises(ValueError):
+        pt.stack((x, y))
+
+    with pytest.raises(ValueError):
+        pt.stack((x, x), axis=3)
 
 
 if __name__ == "__main__":
-    # make sure that import failures get reported, instead of skipping the
-    # tests.
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
diff --git a/test/test_transform.py b/test/test_transform.py
new file mode 100755
index 0000000000000000000000000000000000000000..cf69e2949f3bdc39b8416a5616cc053e5fde5d91
--- /dev/null
+++ b/test/test_transform.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+__copyright__ = "Copyright (C) 2020 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+
+import numpy as np
+import pyopencl as cl  # noqa
+import pyopencl.array as cl_array  # noqa
+import pyopencl.cltypes as cltypes  # noqa
+import pyopencl.tools as cl_tools  # noqa
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+import pytest  # noqa
+
+import pytato as pt
+
+
+def test_namespace_copy():
+    namespace = pt.Namespace()
+    x = pt.Placeholder(namespace, "x", (5,), np.int)
+    namespace.assign("xsquared", x * x)
+
+    namespace_copy = namespace.copy()
+    assert len(namespace_copy) == 2
+    assert isinstance(namespace_copy["x"], pt.Placeholder)
+    assert isinstance(namespace_copy["xsquared"], pt.IndexLambda)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: filetype=pyopencl:fdm=marker