diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d658c057a5d965d98a8136fe7bc876ad8658883e..75f714fd09474c56fb155b5e10be0c7dc3c90faf 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -15,12 +15,13 @@ Python 2.7 POCL:
   script:
   - export PY_EXE=python2.7
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="numpy mako mpi4py"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
   - python2.7
   - pocl
+  - mpi
   except:
   - tags
 
@@ -28,12 +29,13 @@ Python 3.5 POCL:
   script:
   - export PY_EXE=python3.5
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="numpy mako"
+  - export EXTRA_INSTALL="numpy mako mpi4py"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
   - python3.5
   - pocl
+  - mpi
   except:
   - tags
 
diff --git a/meshmode/discretization/connection/__init__.py b/meshmode/discretization/connection/__init__.py
index bd4f1292a083ade8273f5f92e01efebd66d652e8..5f136982d799b14ccaddef05e700dc0ec75b5156 100644
--- a/meshmode/discretization/connection/__init__.py
+++ b/meshmode/discretization/connection/__init__.py
@@ -38,7 +38,7 @@ from meshmode.discretization.connection.face import (
         make_face_restriction,
         make_face_to_all_faces_embedding)
 from meshmode.discretization.connection.opposite_face import \
-        make_opposite_face_connection
+        make_opposite_face_connection, make_partition_connection
 from meshmode.discretization.connection.refinement import \
         make_refinement_connection
 from meshmode.discretization.connection.chained import \
@@ -55,8 +55,9 @@ __all__ = [
         "make_face_restriction",
         "make_face_to_all_faces_embedding",
         "make_opposite_face_connection",
+        "make_partition_connection",
         "make_refinement_connection",
-        "flatten_chained_connection"
+        "flatten_chained_connection",
         ]
 
 __doc__ = """
@@ -73,6 +74,7 @@ __doc__ = """
 .. autofunction:: make_face_to_all_faces_embedding
 
 .. autofunction:: make_opposite_face_connection
+.. autofunction:: make_partition_connection
 
 .. autofunction:: make_refinement_connection
 
diff --git a/meshmode/discretization/connection/face.py b/meshmode/discretization/connection/face.py
index d2745225ab6726399f41a552022df4101eac7da2..6016a146db7b41400dfc995e79005f9b5e089434 100644
--- a/meshmode/discretization/connection/face.py
+++ b/meshmode/discretization/connection/face.py
@@ -88,8 +88,7 @@ def _build_boundary_connection(queue, vol_discr, bdry_discr, connection_data,
                     from_group_index=igrp,
                     from_element_indices=cl.array.to_device(
                         queue,
-                        vol_grp.mesh_el_group.element_nr_base
-                        + data.group_source_element_indices)
+                        data.group_source_element_indices)
                     .with_queue(None),
                     to_element_indices=cl.array.to_device(
                         queue,
diff --git a/meshmode/discretization/connection/opposite_face.py b/meshmode/discretization/connection/opposite_face.py
index 6ce70b2a11b148ce6a1e2cca6f7588ac2549aada..0df3521f8a8632d8275761be5804a9f391f39de1 100644
--- a/meshmode/discretization/connection/opposite_face.py
+++ b/meshmode/discretization/connection/opposite_face.py
@@ -35,143 +35,83 @@ logger = logging.getLogger(__name__)
 
 # {{{ opposite-face connection
 
-def _make_cross_face_batches(
-        queue, vol_discr, bdry_discr,
-        i_tgt_grp, i_src_grp,
-        i_face_tgt,
-        adj_grp,
-        vbc_tgt_grp_face_batch, src_grp_el_lookup):
+def _make_cross_face_batches(queue, tgt_bdry_discr, src_bdry_discr,
+                                    i_tgt_grp, i_src_grp,
+                                    tgt_bdry_element_indices,
+                                    src_bdry_element_indices):
 
-    # {{{ index wrangling
+    # FIXME: This should view-then-transfer
+    # (but PyOpenCL doesn't do non-contiguous transfers for now).
+    tgt_bdry_nodes = (tgt_bdry_discr.groups[i_tgt_grp].view(tgt_bdry_discr.nodes().
+                        get(queue=queue))[:, tgt_bdry_element_indices])
 
-    # Assert that the adjacency group and the restriction
-    # interpolation batch and the adjacency group have the same
-    # element ordering.
+    # FIXME: This should view-then-transfer
+    # (but PyOpenCL doesn't do non-contiguous transfers for now).
+    src_bdry_nodes = (src_bdry_discr.groups[i_src_grp].view(src_bdry_discr.nodes().
+                        get(queue=queue))[:, src_bdry_element_indices])
 
-    adj_grp_tgt_flags = adj_grp.element_faces == i_face_tgt
+    tol = 1e4 * np.finfo(tgt_bdry_nodes.dtype).eps
 
-    assert (
-            np.array_equal(
-                adj_grp.elements[adj_grp_tgt_flags],
-                vbc_tgt_grp_face_batch.from_element_indices
-                .get(queue=queue)))
+    src_mesh_grp = src_bdry_discr.mesh.groups[i_src_grp]
+    src_grp = src_bdry_discr.groups[i_src_grp]
 
-    # find to_element_indices
-
-    to_bdry_element_indices = (
-            vbc_tgt_grp_face_batch.to_element_indices
-            .get(queue=queue))
-
-    # find from_element_indices
-
-    from_vol_element_indices = adj_grp.neighbors[adj_grp_tgt_flags]
-    from_element_faces = adj_grp.neighbor_faces[adj_grp_tgt_flags]
-
-    from_bdry_element_indices = src_grp_el_lookup[
-            from_vol_element_indices, from_element_faces]
-
-    # }}}
-
-    # {{{ visualization (for debugging)
-
-    if 0:
-        print("TVE", adj_grp.elements[adj_grp_tgt_flags])
-        print("TBE", to_bdry_element_indices)
-        print("FVE", from_vol_element_indices)
-        from meshmode.mesh.visualization import draw_2d_mesh
-        import matplotlib.pyplot as pt
-        draw_2d_mesh(vol_discr.mesh, draw_element_numbers=True,
-                set_bounding_box=True,
-                draw_vertex_numbers=False,
-                draw_face_numbers=True,
-                fill=None)
-        pt.figure()
-
-        draw_2d_mesh(bdry_discr.mesh, draw_element_numbers=True,
-                set_bounding_box=True,
-                draw_vertex_numbers=False,
-                draw_face_numbers=True,
-                fill=None)
-
-        pt.show()
-    # }}}
+    dim = src_grp.dim
+    ambient_dim, nelements, ntgt_unit_nodes = tgt_bdry_nodes.shape
+    assert tgt_bdry_nodes.shape == src_bdry_nodes.shape
 
     # {{{ invert face map (using Gauss-Newton)
 
-    to_bdry_nodes = (
-            # FIXME: This should view-then-transfer (but PyOpenCL doesn't do
-            # non-contiguous transfers for now).
-            bdry_discr.groups[i_tgt_grp].view(
-                bdry_discr.nodes().get(queue=queue))
-            [:, to_bdry_element_indices])
-
-    tol = 1e4 * np.finfo(to_bdry_nodes.dtype).eps
-
-    from_mesh_grp = bdry_discr.mesh.groups[i_src_grp]
-    from_grp = bdry_discr.groups[i_src_grp]
-
-    dim = from_grp.dim
-    ambient_dim, nelements, nto_unit_nodes = to_bdry_nodes.shape
-
-    initial_guess = np.mean(from_mesh_grp.vertex_unit_coordinates(), axis=0)
-    from_unit_nodes = np.empty((dim, nelements, nto_unit_nodes))
-    from_unit_nodes[:] = initial_guess.reshape(-1, 1, 1)
+    initial_guess = np.mean(src_mesh_grp.vertex_unit_coordinates(), axis=0)
+    src_unit_nodes = np.empty((dim, nelements, ntgt_unit_nodes))
+    src_unit_nodes[:] = initial_guess.reshape(-1, 1, 1)
 
     import modepy as mp
-    from_vdm = mp.vandermonde(from_grp.basis(), from_grp.unit_nodes)
-    from_inv_t_vdm = la.inv(from_vdm.T)
-    from_nfuncs = len(from_grp.basis())
-
-    # (ambient_dim, nelements, nfrom_unit_nodes)
-    from_bdry_nodes = (
-            # FIXME: This should view-then-transfer (but PyOpenCL doesn't do
-            # non-contiguous transfers for now).
-            bdry_discr.groups[i_src_grp].view(
-                bdry_discr.nodes().get(queue=queue))
-            [:, from_bdry_element_indices])
+    vdm = mp.vandermonde(src_grp.basis(), src_grp.unit_nodes)
+    inv_t_vdm = la.inv(vdm.T)
+    nsrc_funcs = len(src_grp.basis())
 
     def apply_map(unit_nodes):
-        # unit_nodes: (dim, nelements, nto_unit_nodes)
+        # unit_nodes: (dim, nelements, ntgt_unit_nodes)
 
         # basis_at_unit_nodes
-        basis_at_unit_nodes = np.empty((from_nfuncs, nelements, nto_unit_nodes))
+        basis_at_unit_nodes = np.empty((nsrc_funcs, nelements, ntgt_unit_nodes))
 
-        for i, f in enumerate(from_grp.basis()):
+        for i, f in enumerate(src_grp.basis()):
             basis_at_unit_nodes[i] = (
                     f(unit_nodes.reshape(dim, -1))
-                    .reshape(nelements, nto_unit_nodes))
+                    .reshape(nelements, ntgt_unit_nodes))
 
-        intp_coeffs = np.einsum("fj,jet->fet", from_inv_t_vdm, basis_at_unit_nodes)
+        intp_coeffs = np.einsum("fj,jet->fet", inv_t_vdm, basis_at_unit_nodes)
 
         # If we're interpolating 1, we had better get 1 back.
         one_deviation = np.abs(np.sum(intp_coeffs, axis=0) - 1)
         assert (one_deviation < tol).all(), np.max(one_deviation)
 
-        return np.einsum("fet,aef->aet", intp_coeffs, from_bdry_nodes)
+        return np.einsum("fet,aef->aet", intp_coeffs, src_bdry_nodes)
 
     def get_map_jacobian(unit_nodes):
-        # unit_nodes: (dim, nelements, nto_unit_nodes)
+        # unit_nodes: (dim, nelements, ntgt_unit_nodes)
 
         # basis_at_unit_nodes
         dbasis_at_unit_nodes = np.empty(
-                (dim, from_nfuncs, nelements, nto_unit_nodes))
+                (dim, nsrc_funcs, nelements, ntgt_unit_nodes))
 
-        for i, df in enumerate(from_grp.grad_basis()):
+        for i, df in enumerate(src_grp.grad_basis()):
             df_result = df(unit_nodes.reshape(dim, -1))
 
             for rst_axis, df_r in enumerate(df_result):
                 dbasis_at_unit_nodes[rst_axis, i] = (
-                        df_r.reshape(nelements, nto_unit_nodes))
+                        df_r.reshape(nelements, ntgt_unit_nodes))
 
         dintp_coeffs = np.einsum(
-                "fj,rjet->rfet", from_inv_t_vdm, dbasis_at_unit_nodes)
+                "fj,rjet->rfet", inv_t_vdm, dbasis_at_unit_nodes)
 
-        return np.einsum("rfet,aef->raet", dintp_coeffs, from_bdry_nodes)
+        return np.einsum("rfet,aef->raet", dintp_coeffs, src_bdry_nodes)
 
     # {{{ test map applier and jacobian
 
     if 0:
-        u = from_unit_nodes
+        u = src_unit_nodes
         f = apply_map(u)
         for h in [1e-1, 1e-2]:
             du = h*np.random.randn(*u.shape)
@@ -190,16 +130,19 @@ def _make_cross_face_batches(
 
     if 0:
         import matplotlib.pyplot as pt
-        guess = apply_map(from_unit_nodes)
-        goals = to_bdry_nodes
+        guess = apply_map(src_unit_nodes)
+        goals = tgt_bdry_nodes
 
         from meshmode.discretization.visualization import draw_curve
-        draw_curve(bdry_discr)
+        pt.figure(0)
+        draw_curve(tgt_bdry_discr)
+        pt.figure(1)
+        draw_curve(src_bdry_discr)
+        pt.figure(2)
 
         pt.plot(guess[0].reshape(-1), guess[1].reshape(-1), "or")
         pt.plot(goals[0].reshape(-1), goals[1].reshape(-1), "og")
-        pt.plot(from_bdry_nodes[0].reshape(-1), from_bdry_nodes[1].reshape(-1), "o",
-                color="purple")
+        pt.plot(src_bdry_nodes[0].reshape(-1), src_bdry_nodes[1].reshape(-1), "xb")
         pt.show()
 
     # }}}
@@ -208,10 +151,10 @@ def _make_cross_face_batches(
 
     niter = 0
     while True:
-        resid = apply_map(from_unit_nodes) - to_bdry_nodes
+        resid = apply_map(src_unit_nodes) - tgt_bdry_nodes
 
-        df = get_map_jacobian(from_unit_nodes)
-        df_inv_resid = np.empty_like(from_unit_nodes)
+        df = get_map_jacobian(src_unit_nodes)
+        df_inv_resid = np.empty_like(src_unit_nodes)
 
         # For the 1D/2D accelerated versions, we'll use the normal
         # equations and Cramer's rule. If you're looking for high-end
@@ -231,7 +174,7 @@ def _make_cross_face_batches(
 
             det = ata[0, 0]*ata[1, 1] - ata[0, 1]*ata[1, 0]
 
-            df_inv_resid = np.empty_like(from_unit_nodes)
+            df_inv_resid = np.empty_like(src_unit_nodes)
             df_inv_resid[0] = 1/det * (ata[1, 1] * atb[0] - ata[1, 0]*atb[1])
             df_inv_resid[1] = 1/det * (-ata[0, 1] * atb[0] + ata[0, 0]*atb[1])
 
@@ -243,11 +186,24 @@ def _make_cross_face_batches(
             # But we'll only hit it for boundaries of 4+D meshes, in which
             # case... good luck. :)
             for e in range(nelements):
-                for t in range(nto_unit_nodes):
+                for t in range(ntgt_unit_nodes):
                     df_inv_resid[:, e, t], _, _, _ = \
                             la.lstsq(df[:, :, e, t].T, resid[:, e, t])
 
-        from_unit_nodes = from_unit_nodes - df_inv_resid
+        src_unit_nodes = src_unit_nodes - df_inv_resid
+
+        # {{{ visualize next guess
+
+        if 0:
+            import matplotlib.pyplot as pt
+            guess = apply_map(src_unit_nodes)
+            goals = tgt_bdry_nodes
+
+            pt.plot(guess[0].reshape(-1), guess[1].reshape(-1), "rx")
+            pt.plot(goals[0].reshape(-1), goals[1].reshape(-1), "go")
+            pt.show()
+
+        # }}}
 
         max_resid = np.max(np.abs(resid))
         logger.debug("gauss-newton residual: %g" % max_resid)
@@ -264,7 +220,7 @@ def _make_cross_face_batches(
 
     # }}}
 
-    # {{{ find groups of from_unit_nodes
+    # {{{ find groups of src_unit_nodes
 
     def to_dev(ary):
         return cl.array.to_device(queue, ary, array_queue=None)
@@ -275,10 +231,10 @@ def _make_cross_face_batches(
         if not len(todo_elements):
             return
 
-        template_unit_nodes = from_unit_nodes[:, todo_elements[0], :]
+        template_unit_nodes = src_unit_nodes[:, todo_elements[0], :]
 
         unit_node_dist = np.max(np.max(np.abs(
-                from_unit_nodes[:, todo_elements, :]
+                src_unit_nodes[:, todo_elements, :]
                 -
                 template_unit_nodes.reshape(dim, 1, -1)),
                 axis=2), axis=0)
@@ -287,7 +243,7 @@ def _make_cross_face_batches(
         done_elements[close_els] = True
 
         unit_node_dist = np.max(np.max(np.abs(
-                from_unit_nodes[:, todo_elements, :]
+                src_unit_nodes[:, todo_elements, :]
                 -
                 template_unit_nodes.reshape(dim, 1, -1)),
                 axis=2), axis=0)
@@ -295,8 +251,8 @@ def _make_cross_face_batches(
         from meshmode.discretization.connection import InterpolationBatch
         yield InterpolationBatch(
                 from_group_index=i_src_grp,
-                from_element_indices=to_dev(from_bdry_element_indices[close_els]),
-                to_element_indices=to_dev(to_bdry_element_indices[close_els]),
+                from_element_indices=to_dev(src_bdry_element_indices[close_els]),
+                to_element_indices=to_dev(tgt_bdry_element_indices[close_els]),
                 result_unit_nodes=template_unit_nodes,
                 to_element_face=None)
 
@@ -316,7 +272,11 @@ def _find_ibatch_for_face(vbc_tgt_grp_batches, iface):
     return vbc_tgt_grp_face_batch
 
 
-def _make_el_lookup_table(queue, connection, igrp):
+def _make_bdry_el_lookup_table(queue, connection, igrp):
+    """Given a volume-to-boundary connection as *connection*, return
+    a table of shape ``(from_nelements, nfaces)`` to look up the
+    element number of the boundary element for that face.
+    """
     from_nelements = connection.from_discr.groups[igrp].nelements
     from_nfaces = connection.from_discr.mesh.groups[igrp].nfaces
 
@@ -360,25 +320,77 @@ def make_opposite_face_connection(volume_to_bdry_conn):
         groups = [[] for i_tgt_grp in range(ngrps)]
 
         for i_src_grp in range(ngrps):
-            src_grp_el_lookup = _make_el_lookup_table(
+            src_grp_el_lookup = _make_bdry_el_lookup_table(
                     queue, volume_to_bdry_conn, i_src_grp)
 
             for i_tgt_grp in range(ngrps):
                 vbc_tgt_grp_batches = volume_to_bdry_conn.groups[i_tgt_grp].batches
 
-                adj_grp = vol_mesh.facial_adjacency_groups[i_tgt_grp][i_src_grp]
+                adj = vol_mesh.facial_adjacency_groups[i_tgt_grp][i_src_grp]
 
                 for i_face_tgt in range(vol_mesh.groups[i_tgt_grp].nfaces):
                     vbc_tgt_grp_face_batch = _find_ibatch_for_face(
                             vbc_tgt_grp_batches, i_face_tgt)
 
-                    groups[i_tgt_grp].extend(
-                        _make_cross_face_batches(
-                            queue, vol_discr, bdry_discr,
+                    # {{{ index wrangling
+
+                    # Assert that the adjacency group and the restriction
+                    # interpolation batch and the adjacency group have the same
+                    # element ordering.
+
+                    adj_tgt_flags = adj.element_faces == i_face_tgt
+
+                    assert (np.array_equal(
+                                adj.elements[adj_tgt_flags],
+                                vbc_tgt_grp_face_batch.from_element_indices
+                                .get(queue=queue)))
+
+                    # find to_element_indices
+
+                    tgt_bdry_element_indices = (
+                            vbc_tgt_grp_face_batch.to_element_indices
+                            .get(queue=queue))
+
+                    # find from_element_indices
+
+                    src_vol_element_indices = adj.neighbors[adj_tgt_flags]
+                    src_element_faces = adj.neighbor_faces[adj_tgt_flags]
+
+                    src_bdry_element_indices = src_grp_el_lookup[
+                            src_vol_element_indices, src_element_faces]
+
+                    # }}}
+
+                    # {{{ visualization (for debugging)
+
+                    if 0:
+                        print("TVE", adj.elements[adj_tgt_flags])
+                        print("TBE", tgt_bdry_element_indices)
+                        print("FVE", src_vol_element_indices)
+                        from meshmode.mesh.visualization import draw_2d_mesh
+                        import matplotlib.pyplot as pt
+                        draw_2d_mesh(vol_discr.mesh, draw_element_numbers=True,
+                                set_bounding_box=True,
+                                draw_vertex_numbers=False,
+                                draw_face_numbers=True,
+                                fill=None)
+                        pt.figure()
+
+                        draw_2d_mesh(bdry_discr.mesh, draw_element_numbers=True,
+                                set_bounding_box=True,
+                                draw_vertex_numbers=False,
+                                draw_face_numbers=True,
+                                fill=None)
+
+                        pt.show()
+
+                    # }}}
+
+                    groups[i_tgt_grp].extend(_make_cross_face_batches(queue,
+                            bdry_discr, bdry_discr,
                             i_tgt_grp, i_src_grp,
-                            i_face_tgt,
-                            adj_grp,
-                            vbc_tgt_grp_face_batch, src_grp_el_lookup))
+                            tgt_bdry_element_indices,
+                            src_bdry_element_indices))
 
     from meshmode.discretization.connection import (
             DirectDiscretizationConnection, DiscretizationConnectionElementGroup)
@@ -392,4 +404,100 @@ def make_opposite_face_connection(volume_to_bdry_conn):
 
 # }}}
 
+
+# {{{ partition_connection
+
+def make_partition_connection(local_bdry_conn, i_local_part,
+                              remote_bdry, remote_adj_groups,
+                              remote_from_elem_faces, remote_from_elem_indices):
+    """
+    Connects ``local_bdry_conn`` to a neighboring partition.
+
+    :arg local_bdry_conn: A :class:`DiscretizationConnection` of the local
+        partition.
+    :arg i_local_part: The partition number of the local partition.
+    :arg remote_adj_groups: A list of :class:`InterPartitionAdjacency`` of the
+        remote partition.
+    :arg remote_bdry: A :class:`Discretization` of the boundary of the
+        remote partition.
+    :arg remote_from_elem_faces: `remote_from_elem_faces[igrp][idx]` gives the face
+        that batch `idx` interpolates from in group `igrp`.
+    :arg remote_from_elem_indices: `remote_from_elem_indices[igrp][idx]` gives a
+        :class:`np.array` of element indices that batch `idx` interpolates from
+        in group `igrp`.
+
+    :returns: A :class:`DirectDiscretizationConnection` that performs data
+        exchange across faces from the remote partition to partition `i_local_part`.
+
+    .. versionadded:: 2017.1
+
+    .. warning:: Interface is not final.
+    """
+
+    from meshmode.mesh.processing import find_group_indices
+    from meshmode.discretization.connection import (
+            DirectDiscretizationConnection, DiscretizationConnectionElementGroup)
+
+    local_bdry = local_bdry_conn.to_discr
+    local_groups = local_bdry_conn.from_discr.mesh.groups
+
+    part_batches = [[] for _ in local_groups]
+
+    with cl.CommandQueue(local_bdry_conn.cl_context) as queue:
+
+        for i_remote_grp, adj in enumerate(remote_adj_groups):
+            indices = (i_local_part == adj.neighbor_partitions)
+            if not np.any(indices):
+                # Skip because i_remote_grp is not connected to i_local_part.
+                continue
+            i_remote_faces = adj.element_faces[indices]
+            i_local_meshwide_elems = adj.global_neighbors[indices]
+            i_local_faces = adj.neighbor_faces[indices]
+
+            i_local_grps = find_group_indices(local_groups, i_local_meshwide_elems)
+
+            for i_local_grp in np.unique(i_local_grps):
+
+                elem_base = local_groups[i_local_grp].element_nr_base
+                local_el_lookup = _make_bdry_el_lookup_table(queue,
+                                                             local_bdry_conn,
+                                                             i_local_grp)
+
+                for i_remote_face in i_remote_faces:
+
+                    index_flags = np.logical_and(i_local_grps == i_local_grp,
+                                                 i_remote_faces == i_remote_face)
+                    if not np.any(index_flags):
+                        continue
+
+                    remote_bdry_indices = None
+                    for idxs, face in zip(remote_from_elem_indices[i_remote_grp],
+                                          remote_from_elem_faces[i_remote_grp]):
+                        if face == i_remote_face:
+                            remote_bdry_indices = idxs
+                            break
+                    assert remote_bdry_indices is not None
+
+                    elems = i_local_meshwide_elems[index_flags] - elem_base
+                    faces = i_local_faces[index_flags]
+                    local_bdry_indices = local_el_lookup[elems, faces]
+
+                    batches = _make_cross_face_batches(queue,
+                                                       local_bdry, remote_bdry,
+                                                       i_local_grp, i_remote_grp,
+                                                       local_bdry_indices,
+                                                       remote_bdry_indices)
+
+                    part_batches[i_local_grp].extend(batches)
+
+    return DirectDiscretizationConnection(
+            from_discr=remote_bdry,
+            to_discr=local_bdry,
+            groups=[DiscretizationConnectionElementGroup(batches=grp_batches)
+                        for grp_batches in part_batches],
+            is_surjective=True)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/meshmode/distributed.py b/meshmode/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..529190b55e091b432298f442697e02c9b2786ddc
--- /dev/null
+++ b/meshmode/distributed.py
@@ -0,0 +1,224 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = """
+Copyright (C) 2017 Ellis Hoag
+Copyright (C) 2017 Andreas Kloeckner
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np  # noqa
+
+# This file needs to be importable without mpi4py. So don't be tempted to add
+# that import here--push it into individual functions instead.
+
+import logging
+logger = logging.getLogger(__name__)
+
+TAG_BASE = 83411
+TAG_DISTRIBUTE_MESHES = TAG_BASE + 1
+TAG_SEND_BOUNDARY = TAG_BASE + 2
+
+__doc__ = """
+.. autoclass:: MPIMeshDistributor
+.. autoclass:: MPIBoundaryTransceiver
+"""
+
+
+# {{{ mesh distributor
+
+class MPIMeshDistributor(object):
+    """
+    .. automethod:: is_mananger_rank
+    .. automethod:: send_mesh_parts
+    .. automethod:: recv_mesh_part
+    """
+    def __init__(self, mpi_comm, manager_rank=0):
+        """
+        :arg mpi_comm: A :class:`MPI.Intracomm`
+        """
+        self.mpi_comm = mpi_comm
+        self.manager_rank = manager_rank
+
+    def is_mananger_rank(self):
+        return self.mpi_comm.Get_rank() == self.manager_rank
+
+    def send_mesh_parts(self, mesh, part_per_element, num_parts):
+        """
+        :arg mesh: A :class:`Mesh` to distribute to other ranks.
+        :arg part_per_element: A :class:`numpy.ndarray` containing one
+            integer per element of *mesh* indicating which part of the
+            partitioned mesh the element is to become a part of.
+        :arg num_parts: The number of partitions to divide the mesh into.
+
+        Sends each partition to a different rank.
+        Returns one partition that was not sent to any other rank.
+        """
+        mpi_comm = self.mpi_comm
+        rank = mpi_comm.Get_rank()
+        assert num_parts <= mpi_comm.Get_size()
+
+        assert self.is_mananger_rank()
+
+        from meshmode.mesh.processing import partition_mesh
+        parts = [partition_mesh(mesh, part_per_element, i)[0]
+                        for i in range(num_parts)]
+
+        local_part = None
+
+        reqs = []
+        for r, part in enumerate(parts):
+            if r == self.manager_rank:
+                local_part = part
+            else:
+                reqs.append(mpi_comm.isend(part, dest=r, tag=TAG_DISTRIBUTE_MESHES))
+
+        logger.info('rank %d: sent all mesh partitions', rank)
+        for req in reqs:
+            req.wait()
+
+        return local_part
+
+    def receive_mesh_part(self):
+        """
+        Returns the mesh sent by the manager rank.
+        """
+        mpi_comm = self.mpi_comm
+        rank = mpi_comm.Get_rank()
+
+        assert not self.is_mananger_rank(), "Manager rank cannot receive mesh"
+
+        from mpi4py import MPI
+        status = MPI.Status()
+        result = self.mpi_comm.recv(
+                source=self.manager_rank, tag=TAG_DISTRIBUTE_MESHES,
+                status=status)
+        logger.info('rank %d: received local mesh (size = %d)', rank, status.count)
+
+        return result
+
+# }}}
+
+
+# {{{ boundary communication setup helper
+
+class MPIBoundaryCommSetupHelper(object):
+    """
+    .. automethod:: __call__
+    .. automethod:: is_ready
+    """
+    def __init__(self, mpi_comm, queue, local_bdry_conn, i_remote_part,
+            bdry_grp_factory):
+        """
+        :arg mpi_comm: A :class:`MPI.Intracomm`
+        :arg queue:
+        :arg i_remote_part: The part number of the remote partition
+        """
+        self.mpi_comm = mpi_comm
+        self.queue = queue
+        self.i_local_part = mpi_comm.Get_rank()
+        self.i_remote_part = i_remote_part
+        self.local_bdry_conn = local_bdry_conn
+        self.bdry_grp_factory = bdry_grp_factory
+
+    def _post_send_boundary_data(self):
+        local_bdry = self.local_bdry_conn.to_discr
+        local_mesh = self.local_bdry_conn.from_discr.mesh
+        local_adj_groups = [local_mesh.facial_adjacency_groups[i][None]
+                            for i in range(len(local_mesh.groups))]
+        local_batches = [self.local_bdry_conn.groups[i].batches
+                         for i in range(len(local_mesh.groups))]
+        local_to_elem_faces = [[batch.to_element_face for batch in grp_batches]
+                                for grp_batches in local_batches]
+        local_to_elem_indices = [[batch.to_element_indices.get(queue=self.queue)
+                                        for batch in grp_batches]
+                                    for grp_batches in local_batches]
+
+        local_data = {'bdry_mesh': local_bdry.mesh,
+                      'adj': local_adj_groups,
+                      'to_elem_faces': local_to_elem_faces,
+                      'to_elem_indices': local_to_elem_indices}
+        return self.mpi_comm.isend(local_data,
+                                   dest=self.i_remote_part,
+                                   tag=TAG_SEND_BOUNDARY)
+
+    def post_sends(self):
+        logger.info("bdry comm rank %d send begin", self.i_local_part)
+        self.send_req = self._post_send_boundary_data()
+
+    def is_setup_ready(self):
+        """
+        Returns True if the rank boundary data is ready to be received.
+        """
+        return self.mpi_comm.Iprobe(source=self.i_remote_part, tag=TAG_SEND_BOUNDARY)
+
+    def complete_setup(self):
+        """
+        Returns the tuple ``remote_to_local_bdry_conn``
+        where `remote_to_local_bdry_conn` is a
+        :class:`DirectDiscretizationConnection` that gives the connection that
+        performs data exchange across faces from partition `i_remote_part` to the
+        local mesh.
+        """
+        remote_data = self.mpi_comm.recv(
+                source=self.i_remote_part,
+                tag=TAG_SEND_BOUNDARY)
+
+        logger.debug('rank %d: Received rank %d data',
+                     self.i_local_part, self.i_remote_part)
+
+        from meshmode.discretization import Discretization
+        remote_bdry_mesh = remote_data['bdry_mesh']
+        remote_bdry = Discretization(self.queue.context, remote_bdry_mesh,
+                                     self.bdry_grp_factory)
+        remote_adj_groups = remote_data['adj']
+        remote_to_elem_faces = remote_data['to_elem_faces']
+        remote_to_elem_indices = remote_data['to_elem_indices']
+
+        # Connect local_mesh to remote_mesh
+        from meshmode.discretization.connection import make_partition_connection
+        remote_to_local_bdry_conn = make_partition_connection(self.local_bdry_conn,
+                                                              self.i_local_part,
+                                                              remote_bdry,
+                                                              remote_adj_groups,
+                                                              remote_to_elem_faces,
+                                                              remote_to_elem_indices)
+        self.send_req.wait()
+        return remote_to_local_bdry_conn
+
+# }}}
+
+
+def get_connected_partitions(mesh):
+    """
+    :arg mesh: A :class:`Mesh`
+    Returns the set of partition numbers that are connected to `mesh`
+    """
+    connected_parts = set()
+    from meshmode.mesh import InterPartitionAdjacencyGroup
+    for adj in mesh.facial_adjacency_groups:
+        if isinstance(adj[None], InterPartitionAdjacencyGroup):
+            indices = (adj[None].neighbor_partitions >= 0)
+            connected_parts = connected_parts.union(
+                                        adj[None].neighbor_partitions[indices])
+    return connected_parts
+
+# vim: foldmethod=marker
diff --git a/meshmode/mesh/__init__.py b/meshmode/mesh/__init__.py
index 9e374dfb765db8bdbfa579c886a6638a75fb5bc4..8ff2981e9a50e89c409654f81d36d2e6d8de1d13 100644
--- a/meshmode/mesh/__init__.py
+++ b/meshmode/mesh/__init__.py
@@ -54,6 +54,7 @@ Predefined Boundary tags
 .. autoclass:: BTAG_ALL
 .. autoclass:: BTAG_REALLY_ALL
 .. autoclass:: BTAG_NO_BOUNDARY
+.. autoclass:: BTAG_PARTITION
 """
 
 
@@ -88,7 +89,34 @@ class BTAG_NO_BOUNDARY(object):  # noqa
     pass
 
 
-SYSTEM_TAGS = set([BTAG_NONE, BTAG_ALL, BTAG_REALLY_ALL, BTAG_NO_BOUNDARY])
+class BTAG_PARTITION(object):  # noqa
+    """
+    A boundary tag indicating that this edge is adjacent to an element of
+    another :class:`Mesh`. The partition number of the adjacent mesh
+    is given by ``part_nr``.
+
+    .. attribute:: part_nr
+
+    .. versionadded:: 2017.1
+    """
+    def __init__(self, part_nr):
+        self.part_nr = int(part_nr)
+
+    def __hash__(self):
+        return hash((type(self), self.part_nr))
+
+    def __eq__(self, other):
+        if isinstance(other, BTAG_PARTITION):
+            return self.part_nr == other.part_nr
+        else:
+            return False
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+
+SYSTEM_TAGS = set([BTAG_NONE, BTAG_ALL, BTAG_REALLY_ALL, BTAG_NO_BOUNDARY,
+                   BTAG_PARTITION])
 
 # }}}
 
@@ -458,6 +486,82 @@ class FacialAdjacencyGroup(Record):
 # }}}
 
 
+# {{{ partition adjacency
+
+class InterPartitionAdjacencyGroup(FacialAdjacencyGroup):
+    """
+    Describes boundary adjacency information of elements in
+    :class:`MeshElementGroup`.
+
+    .. attribute:: igroup
+
+        The group number of this group.
+
+    .. attribute:: ineighbor_group
+
+        *None* for boundary faces.
+
+    .. attribute:: elements
+
+        Group-local element numbers.
+        Element ``element_id_dtype elements[i]`` and face
+        ``face_id_dtype element_faces[i]`` is connected to neighbor element
+        ``element_id_dtype global_neighbors[i]`` with face
+        ``face_id_dtype global_neighbor_faces[i]``. The partition number it connects
+        to is ``neighbor_partitions[i]``.
+
+    .. attribute:: element_faces
+
+        ``face_id_dtype element_faces[i]`` gives the face of
+        ``element_id_dtype elements[i]`` that is connected to
+        ``global_neighbors[i]``.
+
+    .. attribute:: neighbors
+
+        Since this is a boundary, ``element_id_dtype neighbors[i]`` is interpreted
+        as a boundary tag. ``-neighbors[i]`` should be interpreted according to
+        :class:``Mesh.boundary_tags``.
+
+    .. attribute:: global_neighbors
+
+        Mesh-wide element numbers.
+        ``element_id_dtype global_neighbors[i]`` gives the element number within the
+        neighboring partition of the element connected to
+        ``element_id_dtype elements[i]``. Use ``find_group_instances()`` to find the
+        group that the element belongs to, then subtract ``element_nr_base`` to find
+        the element of the group.
+
+        If ``global_neighbors[i]`` is negative, ``elements[i]`` is on a true
+        boundary and is not connected to any other :class:``Mesh``.
+
+    .. attribute:: neighbor_faces
+
+        ``face_id_dtype global_neighbor_faces[i]`` gives face index within the
+        neighboring partition of the face connected to
+        ``element_id_dtype elements[i]``
+
+        If ``neighbor_partitions[i]`` is negative, ``elements[i]`` is on a true
+        boundary and is not connected to any other :class:``Mesh``.
+
+    .. attribute:: neighbor_partitions
+
+        ``neighbor_partitions[i]`` gives the partition number that ``elements[i]``
+        is connected to.
+
+        If ``neighbor_partitions[i]`` is negative, ``elements[i]`` is on a true
+        boundary and is not connected to any other :class:``Mesh``.
+
+    .. versionadded:: 2017.1
+    """
+
+    def __eq__(self, other):
+        return (super.__eq__(self, other)
+            and np.array_equal(self.global_neighbors, other.global_neighbors)
+            and np.array_equal(self.neighbor_partitions, other.neighbor_partitions))
+
+# }}}
+
+
 # {{{ mesh
 
 class Mesh(Record):
@@ -487,7 +591,8 @@ class Mesh(Record):
         the set of facial adjacency relations between group *igrp*
         and *ineighbor_group*. *ineighbor_group* and *igrp* may be
         identical, or *ineighbor_group* may be *None*, in which case
-        a group containing boundary faces is returned.
+        an :class:``InterPartitionAdjacency`` group containing boundary
+        faces is returned.
 
         Referencing this attribute may raise
         :exc:`meshmode.DataUnavailable`.
@@ -535,6 +640,7 @@ class Mesh(Record):
 
     .. automethod:: __eq__
     .. automethod:: __ne__
+    .. automethos:: adjacency_list
     """
 
     face_id_dtype = np.int8
@@ -578,6 +684,7 @@ class Mesh(Record):
             will result in exceptions. Lastly, a data structure as described in
             :attr:`facial_adjacency_groups` may be passed.
         """
+
         el_nr = 0
         node_nr = 0
 
diff --git a/meshmode/mesh/processing.py b/meshmode/mesh/processing.py
index 36c38eda9d6704f5f59a6e589578d019d7485b88..ea72facf2fea4718d496d2e617a9aff2f6a9cb32 100644
--- a/meshmode/mesh/processing.py
+++ b/meshmode/mesh/processing.py
@@ -31,6 +31,7 @@ import modepy as mp
 
 
 __doc__ = """
+.. autofunction:: find_group_indices
 .. autofunction:: partition_mesh
 .. autofunction:: find_volume_mesh_element_orientations
 .. autofunction:: perform_flips
@@ -41,15 +42,32 @@ __doc__ = """
 """
 
 
+def find_group_indices(groups, meshwide_elems):
+    """
+    :arg groups: A list of :class:``MeshElementGroup`` instances that contain
+        ``meshwide_elems``.
+    :arg meshwide_elems: A :class:``numpy.ndarray`` of mesh-wide element numbers
+        Usually computed by ``elem + element_nr_base``.
+    :returns: A :class:``numpy.ndarray`` of group numbers that ``meshwide_elem``
+        belongs to.
+    """
+    grps = np.zeros_like(meshwide_elems)
+    next_grp_boundary = 0
+    for igrp, grp in enumerate(groups):
+        next_grp_boundary += grp.nelements
+        grps += meshwide_elems >= next_grp_boundary
+    return grps
+
+
 # {{{ partition_mesh
 
-def partition_mesh(mesh, part_per_element, part_nr):
+def partition_mesh(mesh, part_per_element, part_num):
     """
     :arg mesh: A :class:`meshmode.mesh.Mesh` to be partitioned.
     :arg part_per_element: A :class:`numpy.ndarray` containing one
         integer per element of *mesh* indicating which part of the
         partitioned mesh the element is to become a part of.
-    :arg part_nr: The part number of the mesh to return.
+    :arg part_num: The part number of the mesh to return.
 
     :returns: A tuple ``(part_mesh, part_to_global)``, where *part_mesh*
         is a :class:`meshmode.mesh.Mesh` that is a partition of mesh, and
@@ -57,15 +75,12 @@ def partition_mesh(mesh, part_per_element, part_nr):
         numbers on *part_mesh* to ones in *mesh*.
 
     .. versionadded:: 2017.1
-
-    .. warning:: Interface is not final. Connectivity between elements
-        across groups needs to be added.
     """
     assert len(part_per_element) == mesh.nelements, (
         "part_per_element must have shape (mesh.nelements,)")
 
     # Contains the indices of the elements requested.
-    queried_elems = np.where(np.array(part_per_element) == part_nr)[0]
+    queried_elems = np.where(np.array(part_per_element) == part_num)[0]
 
     num_groups = len(mesh.groups)
     new_indices = []
@@ -80,10 +95,10 @@ def partition_mesh(mesh, part_per_element, part_nr):
     skip_groups = []
     num_prev_elems = 0
     start_idx = 0
-    for group_nr in range(num_groups):
-        mesh_group = mesh.groups[group_nr]
+    for group_num in range(num_groups):
+        mesh_group = mesh.groups[group_num]
 
-        # Find the index of first element in the next group
+        # Find the index of first element in the next group.
         end_idx = len(queried_elems)
         for idx in range(start_idx, len(queried_elems)):
             if queried_elems[idx] - num_prev_elems >= mesh_group.nelements:
@@ -91,7 +106,7 @@ def partition_mesh(mesh, part_per_element, part_nr):
                 break
 
         if start_idx == end_idx:
-            skip_groups.append(group_nr)
+            skip_groups.append(group_num)
             new_indices.append(np.array([]))
             new_nodes.append(np.array([]))
             num_prev_elems += mesh_group.nelements
@@ -107,10 +122,10 @@ def partition_mesh(mesh, part_per_element, part_nr):
             for j in range(start_idx, end_idx):
                 elems = queried_elems[j] - num_prev_elems
                 new_idx = j - start_idx
-                new_nodes[group_nr][i, new_idx, :] = mesh_group.nodes[i, elems, :]
+                new_nodes[group_num][i, new_idx, :] = mesh_group.nodes[i, elems, :]
 
-        #index_set = np.append(index_set, new_indices[group_nr].ravel())
-        index_sets = np.append(index_sets, set(new_indices[group_nr].ravel()))
+        #index_set = np.append(index_set, new_indices[group_num].ravel())
+        index_sets = np.append(index_sets, set(new_indices[group_num].ravel()))
 
         num_prev_elems += mesh_group.nelements
         start_idx = end_idx
@@ -124,31 +139,126 @@ def partition_mesh(mesh, part_per_element, part_nr):
         new_vertices[dim] = mesh.vertices[dim][required_indices]
 
     # Our indices need to be in range [0, len(mesh.nelements)].
-    for group_nr in range(num_groups):
-        if group_nr not in skip_groups:
-            for i in range(len(new_indices[group_nr])):
-                for j in range(len(new_indices[group_nr][0])):
-                    original_index = new_indices[group_nr][i, j]
-                    new_indices[group_nr][i, j] = np.where(
-                        required_indices == original_index)[0]
+    for group_num in range(num_groups):
+        if group_num not in skip_groups:
+            for i in range(len(new_indices[group_num])):
+                for j in range(len(new_indices[group_num][0])):
+                    original_index = new_indices[group_num][i, j]
+                    new_indices[group_num][i, j] = np.where(
+                            required_indices == original_index)[0]
 
     new_mesh_groups = []
-    for group_nr in range(num_groups):
-        if group_nr not in skip_groups:
-            mesh_group = mesh.groups[group_nr]
+    for group_num, mesh_group in enumerate(mesh.groups):
+        if group_num not in skip_groups:
             new_mesh_groups.append(
                 type(mesh_group)(
-                    mesh_group.order, new_indices[group_nr],
-                    new_nodes[group_nr],
+                    mesh_group.order, new_indices[group_num],
+                    new_nodes[group_num],
                     unit_nodes=mesh_group.unit_nodes))
 
+    from meshmode.mesh import BTAG_ALL, BTAG_PARTITION
+    boundary_tags = [BTAG_PARTITION(n) for n in np.unique(part_per_element)]
+
     from meshmode.mesh import Mesh
     part_mesh = Mesh(
             new_vertices,
             new_mesh_groups,
+            facial_adjacency_groups=None,
+            boundary_tags=boundary_tags,
             is_conforming=mesh.is_conforming)
 
-    return part_mesh, queried_elems
+    adj_data = [[] for _ in range(len(part_mesh.groups))]
+
+    for igrp, grp in enumerate(part_mesh.groups):
+        elem_base = grp.element_nr_base
+        boundary_adj = part_mesh.facial_adjacency_groups[igrp][None]
+        boundary_elems = boundary_adj.elements
+        boundary_faces = boundary_adj.element_faces
+        p_meshwide_elems = queried_elems[boundary_elems + elem_base]
+        parent_igrps = find_group_indices(mesh.groups, p_meshwide_elems)
+        for adj_idx, elem in enumerate(boundary_elems):
+            face = boundary_faces[adj_idx]
+            tag = -boundary_adj.neighbors[adj_idx]
+            assert tag >= 0, "Expected boundary tag in adjacency group."
+
+            parent_igrp = parent_igrps[adj_idx]
+            parent_elem_base = mesh.groups[parent_igrp].element_nr_base
+            parent_elem = p_meshwide_elems[adj_idx] - parent_elem_base
+
+            parent_adj = mesh.facial_adjacency_groups[parent_igrp]
+
+            for parent_facial_group in parent_adj.values():
+                indices, = np.nonzero(parent_facial_group.elements == parent_elem)
+                for idx in indices:
+                    if (parent_facial_group.neighbors[idx] >= 0 and
+                               parent_facial_group.element_faces[idx] == face):
+                        rank_neighbor = (parent_facial_group.neighbors[idx]
+                                            + parent_elem_base)
+                        n_face = parent_facial_group.neighbor_faces[idx]
+
+                        n_part_num = part_per_element[rank_neighbor]
+                        tag = tag & ~part_mesh.boundary_tag_bit(BTAG_ALL)
+                        tag = tag | part_mesh.boundary_tag_bit(
+                                                    BTAG_PARTITION(n_part_num))
+                        boundary_adj.neighbors[adj_idx] = -tag
+
+                        # Find the neighbor element from the other partition.
+                        n_meshwide_elem = np.count_nonzero(
+                                    part_per_element[:rank_neighbor] == n_part_num)
+
+                        adj_data[igrp].append((elem, face,
+                                               n_part_num, n_meshwide_elem, n_face))
+
+    connected_mesh = part_mesh.copy()
+
+    from meshmode.mesh import InterPartitionAdjacencyGroup
+    for igrp, adj in enumerate(adj_data):
+        if adj:
+            bdry = connected_mesh.facial_adjacency_groups[igrp][None]
+            # Initialize connections
+            n_parts = np.zeros_like(bdry.elements)
+            n_parts.fill(-1)
+            global_n_elems = np.copy(n_parts)
+            n_faces = np.copy(n_parts)
+
+            # Sort both sets of elements so that we can quickly merge
+            # the two data structures
+            bdry_perm = np.lexsort([bdry.element_faces, bdry.elements])
+            elems = bdry.elements[bdry_perm]
+            faces = bdry.element_faces[bdry_perm]
+            neighbors = bdry.neighbors[bdry_perm]
+            adj_elems, adj_faces, adj_n_parts, adj_gl_n_elems, adj_n_faces =\
+                                    np.array(adj).T
+            adj_perm = np.lexsort([adj_faces, adj_elems])
+            adj_elems = adj_elems[adj_perm]
+            adj_faces = adj_faces[adj_perm]
+            adj_n_parts = adj_n_parts[adj_perm]
+            adj_gl_n_elems = adj_gl_n_elems[adj_perm]
+            adj_n_faces = adj_n_faces[adj_perm]
+
+            # Merge interpartition adjacency data with FacialAdjacencyGroup
+            adj_idx = 0
+            for bdry_idx in range(len(elems)):
+                if adj_idx >= len(adj_elems):
+                    break
+                if (adj_elems[adj_idx] == elems[bdry_idx]
+                        and adj_faces[adj_idx] == faces[bdry_idx]):
+                    n_parts[bdry_idx] = adj_n_parts[adj_idx]
+                    global_n_elems[bdry_idx] = adj_gl_n_elems[adj_idx]
+                    n_faces[bdry_idx] = adj_n_faces[adj_idx]
+                    adj_idx += 1
+
+            connected_mesh.facial_adjacency_groups[igrp][None] =\
+                    InterPartitionAdjacencyGroup(elements=elems,
+                                                 element_faces=faces,
+                                                 neighbors=neighbors,
+                                                 igroup=bdry.igroup,
+                                                 ineighbor_group=None,
+                                                 neighbor_partitions=n_parts,
+                                                 global_neighbors=global_n_elems,
+                                                 neighbor_faces=n_faces)
+
+    return connected_mesh, queried_elems
 
 # }}}
 
@@ -391,6 +501,8 @@ def merge_disjoint_meshes(meshes, skip_tests=False, single_group=False):
         grp_cls = None
         order = None
         unit_nodes = None
+        nodal_adjacency = None
+        facial_adjacency_groups = None
 
         for mesh in meshes:
             if mesh._nodal_adjacency is not None:
@@ -426,6 +538,8 @@ def merge_disjoint_meshes(meshes, skip_tests=False, single_group=False):
 
     else:
         new_groups = []
+        nodal_adjacency = None
+        facial_adjacency_groups = None
 
         for mesh, vert_base in zip(meshes, vert_bases):
             if mesh._nodal_adjacency is not None:
diff --git a/test/test_meshmode.py b/test/test_meshmode.py
index e97d8efa036f378272b7f94588f1e867265d9a35..bd775dfc8a682455790df02b9327f54cffe36535 100644
--- a/test/test_meshmode.py
+++ b/test/test_meshmode.py
@@ -49,56 +49,6 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-# {{{ partition_mesh
-
-def test_partition_torus_mesh():
-    from meshmode.mesh.generation import generate_torus
-    my_mesh = generate_torus(2, 1, n_outer=2, n_inner=2)
-
-    part_per_element = np.array([0, 1, 2, 1, 1, 2, 1, 0])
-
-    from meshmode.mesh.processing import partition_mesh
-    (part_mesh0, _) = partition_mesh(my_mesh, part_per_element, 0)
-    (part_mesh1, _) = partition_mesh(my_mesh, part_per_element, 1)
-    (part_mesh2, _) = partition_mesh(my_mesh, part_per_element, 2)
-
-    assert part_mesh0.nelements == 2
-    assert part_mesh1.nelements == 4
-    assert part_mesh2.nelements == 2
-
-
-def test_partition_boxes_mesh():
-    pymetis = pytest.importorskip('pymetis')
-
-    n = 5
-    num_parts = 7
-    from meshmode.mesh.generation import generate_regular_rect_mesh
-    mesh1 = generate_regular_rect_mesh(a=(0, 0, 0), b=(1, 1, 1), n=(n, n, n))
-    mesh2 = generate_regular_rect_mesh(a=(2, 2, 2), b=(3, 3, 3), n=(n, n, n))
-
-    from meshmode.mesh.processing import merge_disjoint_meshes
-    mesh = merge_disjoint_meshes([mesh1, mesh2])
-
-    adjacency_list = np.zeros((mesh.nelements,), dtype=set)
-    for elem in range(mesh.nelements):
-        adjacency_list[elem] = set()
-        starts = mesh.nodal_adjacency.neighbors_starts
-        for n in range(starts[elem], starts[elem + 1]):
-            adjacency_list[elem].add(mesh.nodal_adjacency.neighbors[n])
-
-    (_, p) = pymetis.part_graph(num_parts, adjacency=adjacency_list)
-    part_per_element = np.array(p)
-
-    from meshmode.mesh.processing import partition_mesh
-    new_meshes = [
-        partition_mesh(mesh, part_per_element, i)[0] for i in range(num_parts)]
-
-    assert mesh.nelements == np.sum(
-        [new_meshes[i].nelements for i in range(num_parts)])
-
-# }}}
-
-
 # {{{ circle mesh
 
 def test_circle_mesh(do_plot=False):
@@ -143,9 +93,9 @@ def test_circle_mesh(do_plot=False):
     ("warp", 3, [10, 20, 30]),
     ])
 @pytest.mark.parametrize("per_face_groups", [False, True])
-def test_boundary_interpolation(ctx_getter, group_factory, boundary_tag,
+def test_boundary_interpolation(ctx_factory, group_factory, boundary_tag,
         mesh_name, dim, mesh_pars, per_face_groups):
-    cl_ctx = ctx_getter()
+    cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
     from meshmode.discretization import Discretization
@@ -231,9 +181,9 @@ def test_boundary_interpolation(ctx_getter, group_factory, boundary_tag,
     ("warp", 3, [10, 20, 30]),
     ])
 @pytest.mark.parametrize("per_face_groups", [False, True])
-def test_all_faces_interpolation(ctx_getter, mesh_name, dim, mesh_pars,
+def test_all_faces_interpolation(ctx_factory, mesh_name, dim, mesh_pars,
         per_face_groups):
-    cl_ctx = ctx_getter()
+    cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
     from meshmode.discretization import Discretization
@@ -343,11 +293,11 @@ def test_all_faces_interpolation(ctx_getter, mesh_name, dim, mesh_pars,
     ("warp", 2, [3, 5, 7]),
     ("warp", 3, [3, 5]),
     ])
-def test_opposite_face_interpolation(ctx_getter, group_factory,
+def test_opposite_face_interpolation(ctx_factory, group_factory,
         mesh_name, dim, mesh_pars):
     logging.basicConfig(level=logging.INFO)
 
-    cl_ctx = ctx_getter()
+    cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
     from meshmode.discretization import Discretization
@@ -459,12 +409,12 @@ def test_element_orientation():
     ("ball", lambda: mgen.generate_icosahedron(1, 1)),
     ("torus", lambda: mgen.generate_torus(5, 1)),
     ])
-def test_3d_orientation(ctx_getter, what, mesh_gen_func, visualize=False):
+def test_3d_orientation(ctx_factory, what, mesh_gen_func, visualize=False):
     pytest.importorskip("pytential")
 
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     mesh = mesh_gen_func()
@@ -514,7 +464,7 @@ def test_3d_orientation(ctx_getter, what, mesh_gen_func, visualize=False):
 
 # {{{ merge and map
 
-def test_merge_and_map(ctx_getter, visualize=False):
+def test_merge_and_map(ctx_factory, visualize=False):
     from meshmode.mesh.io import generate_gmsh, FileSource
     from meshmode.mesh.generation import generate_box_mesh
     from meshmode.mesh import TensorProductElementGroup
@@ -555,7 +505,7 @@ def test_merge_and_map(ctx_getter, visualize=False):
 
     if visualize:
         from meshmode.discretization import Discretization
-        cl_ctx = ctx_getter()
+        cl_ctx = ctx_factory()
         queue = cl.CommandQueue(cl_ctx)
 
         discr = Discretization(cl_ctx, mesh3, discr_grp_factory)
@@ -571,10 +521,10 @@ def test_merge_and_map(ctx_getter, visualize=False):
 
 @pytest.mark.parametrize("dim", [2, 3])
 @pytest.mark.parametrize("order", [1, 3])
-def test_sanity_single_element(ctx_getter, dim, order, visualize=False):
+def test_sanity_single_element(ctx_factory, dim, order, visualize=False):
     pytest.importorskip("pytential")
 
-    cl_ctx = ctx_getter()
+    cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
     from modepy.tools import unit_vertices
@@ -658,12 +608,12 @@ def test_sanity_single_element(ctx_getter, dim, order, visualize=False):
 
 @pytest.mark.parametrize("dim", [2, 3, 4])
 @pytest.mark.parametrize("order", [3])
-def test_sanity_qhull_nd(ctx_getter, dim, order):
+def test_sanity_qhull_nd(ctx_factory, dim, order):
     pytest.importorskip("scipy")
 
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     from scipy.spatial import Delaunay
@@ -712,13 +662,13 @@ def test_sanity_qhull_nd(ctx_getter, dim, order):
     ("ball-radius-1.step", 3),
     ])
 @pytest.mark.parametrize("mesh_order", [1, 2])
-def test_sanity_balls(ctx_getter, src_file, dim, mesh_order,
+def test_sanity_balls(ctx_factory, src_file, dim, mesh_order,
         visualize=False):
     pytest.importorskip("pytential")
 
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     from pytools.convergence import EOCRecorder
@@ -839,7 +789,7 @@ def test_rect_mesh(do_plot=False):
         pt.show()
 
 
-def test_box_mesh(ctx_getter, visualize=False):
+def test_box_mesh(ctx_factory, visualize=False):
     from meshmode.mesh.generation import generate_box_mesh
     mesh = generate_box_mesh(3*(np.linspace(0, 1, 5),))
 
@@ -847,7 +797,7 @@ def test_box_mesh(ctx_getter, visualize=False):
         from meshmode.discretization import Discretization
         from meshmode.discretization.poly_element import \
                 PolynomialWarpAndBlendGroupFactory
-        cl_ctx = ctx_getter()
+        cl_ctx = ctx_factory()
         queue = cl.CommandQueue(cl_ctx)
 
         discr = Discretization(cl_ctx, mesh,
@@ -996,6 +946,8 @@ def no_test_quad_mesh_3d():
 # }}}
 
 
+# {{{ test_quad_single_element
+
 def test_quad_single_element():
     from meshmode.mesh.generation import make_group_from_vertices
     from meshmode.mesh import Mesh, TensorProductElementGroup
@@ -1019,6 +971,10 @@ def test_quad_single_element():
                 mg.nodes[1].reshape(-1), "o")
         plt.show()
 
+# }}}
+
+
+# {{{ test_quad_multi_element
 
 def test_quad_multi_element():
     from meshmode.mesh.generation import generate_box_mesh
@@ -1039,6 +995,8 @@ def test_quad_multi_element():
                 mg.nodes[1].reshape(-1), "o")
         plt.show()
 
+# }}}
+
 
 def test_vtk_overwrite(ctx_getter):
     pytest.importorskip("pyvisfile")
diff --git a/test/test_partition.py b/test/test_partition.py
new file mode 100644
index 0000000000000000000000000000000000000000..78258f4f1a0becdba402e32f68b05f0d5fa52ac0
--- /dev/null
+++ b/test/test_partition.py
@@ -0,0 +1,559 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = """
+Copyright (C) 2017 Ellis Hoag
+Copyright (C) 2017 Andreas Kloeckner
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from six.moves import range
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.array  # noqa
+import pyopencl.clmath  # noqa
+
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl
+        as pytest_generate_tests)
+
+from meshmode.discretization.poly_element import (
+        PolynomialWarpAndBlendGroupFactory)
+from meshmode.mesh import BTAG_ALL
+
+import pytest
+import os
+
+import logging
+logger = logging.getLogger(__name__)
+
+# Is there a smart way of choosing this number?
+# Currenly it is the same as the base from MPIBoundaryTransceiver
+TAG_BASE = 83411
+TAG_SEND_REMOTE_NODES = TAG_BASE + 3
+TAG_SEND_LOCAL_NODES = TAG_BASE + 4
+
+
+# {{{ partition_interpolation
+
+@pytest.mark.parametrize("num_parts", [2, 3])
+@pytest.mark.parametrize("num_groups", [1, 2])
+@pytest.mark.parametrize("scramble_partitions", [False])
+@pytest.mark.parametrize(("dim", "mesh_pars"),
+        [
+         (2, [3, 4, 7]),
+         (3, [3, 4])
+        ])
+def test_partition_interpolation(ctx_factory, dim, mesh_pars,
+                                 num_parts, num_groups, scramble_partitions):
+    np.random.seed(42)
+    group_factory = PolynomialWarpAndBlendGroupFactory
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+    order = 4
+
+    from pytools.convergence import EOCRecorder
+    eoc_rec = dict()
+    for i in range(num_parts):
+        for j in range(num_parts):
+            if i == j:
+                continue
+            eoc_rec[i, j] = EOCRecorder()
+
+    def f(x):
+        return 10.*cl.clmath.sin(50.*x)
+
+    for n in mesh_pars:
+        from meshmode.mesh.generation import generate_warped_rect_mesh
+        meshes = [generate_warped_rect_mesh(dim, order=order, n=n)
+                                for _ in range(num_groups)]
+
+        if num_groups > 1:
+            from meshmode.mesh.processing import merge_disjoint_meshes
+            mesh = merge_disjoint_meshes(meshes)
+        else:
+            mesh = meshes[0]
+
+        if scramble_partitions:
+            part_per_element = np.random.randint(num_parts, size=mesh.nelements)
+        else:
+            from pymetis import part_graph
+            _, p = part_graph(num_parts,
+                              xadj=mesh.nodal_adjacency.neighbors_starts.tolist(),
+                              adjncy=mesh.nodal_adjacency.neighbors.tolist())
+            part_per_element = np.array(p)
+
+        from meshmode.mesh.processing import partition_mesh
+        part_meshes = [
+            partition_mesh(mesh, part_per_element, i)[0] for i in range(num_parts)]
+
+        from meshmode.discretization import Discretization
+        vol_discrs = [Discretization(cl_ctx, part_meshes[i], group_factory(order))
+                        for i in range(num_parts)]
+
+        from meshmode.mesh import BTAG_PARTITION
+        from meshmode.discretization.connection import (make_face_restriction,
+                                                        make_partition_connection,
+                                                        check_connection)
+
+        for i_local_part, i_remote_part in eoc_rec.keys():
+            if eoc_rec[i_local_part, i_remote_part] is None:
+                continue
+
+            # Mark faces within local_mesh that are connected to remote_mesh
+            local_bdry_conn = make_face_restriction(vol_discrs[i_local_part],
+                                                    group_factory(order),
+                                                    BTAG_PARTITION(i_remote_part))
+
+            # If these parts are not connected, don't bother checking the error
+            bdry_nodes = local_bdry_conn.to_discr.nodes()
+            if bdry_nodes.size == 0:
+                eoc_rec[i_local_part, i_remote_part] = None
+                continue
+
+            # Mark faces within remote_mesh that are connected to local_mesh
+            remote_bdry_conn = make_face_restriction(vol_discrs[i_remote_part],
+                                                     group_factory(order),
+                                                     BTAG_PARTITION(i_local_part))
+
+            assert bdry_nodes.size == remote_bdry_conn.to_discr.nodes().size, \
+                        "partitions do not have the same number of connected nodes"
+
+            # Gather just enough information for the connection
+            local_bdry = local_bdry_conn.to_discr
+            local_mesh = part_meshes[i_local_part]
+            local_adj_groups = [local_mesh.facial_adjacency_groups[i][None]
+                                for i in range(len(local_mesh.groups))]
+            local_batches = [local_bdry_conn.groups[i].batches
+                                for i in range(len(local_mesh.groups))]
+            local_from_elem_faces = [[batch.to_element_face
+                                            for batch in grp_batches]
+                                        for grp_batches in local_batches]
+            local_from_elem_indices = [[batch.to_element_indices.get(queue=queue)
+                                            for batch in grp_batches]
+                                        for grp_batches in local_batches]
+
+            remote_bdry = remote_bdry_conn.to_discr
+            remote_mesh = part_meshes[i_remote_part]
+            remote_adj_groups = [remote_mesh.facial_adjacency_groups[i][None]
+                                for i in range(len(remote_mesh.groups))]
+            remote_batches = [remote_bdry_conn.groups[i].batches
+                                for i in range(len(remote_mesh.groups))]
+            remote_from_elem_faces = [[batch.to_element_face
+                                            for batch in grp_batches]
+                                        for grp_batches in remote_batches]
+            remote_from_elem_indices = [[batch.to_element_indices.get(queue=queue)
+                                            for batch in grp_batches]
+                                        for grp_batches in remote_batches]
+
+            # Connect from remote_mesh to local_mesh
+            remote_to_local_conn = make_partition_connection(local_bdry_conn,
+                                                             i_local_part,
+                                                             remote_bdry,
+                                                             remote_adj_groups,
+                                                             remote_from_elem_faces,
+                                                            remote_from_elem_indices)
+            # Connect from local mesh to remote mesh
+            local_to_remote_conn = make_partition_connection(remote_bdry_conn,
+                                                             i_remote_part,
+                                                             local_bdry,
+                                                             local_adj_groups,
+                                                             local_from_elem_faces,
+                                                             local_from_elem_indices)
+            check_connection(remote_to_local_conn)
+            check_connection(local_to_remote_conn)
+
+            true_local_points = f(local_bdry.nodes()[0].with_queue(queue))
+            remote_points = local_to_remote_conn(queue, true_local_points)
+            local_points = remote_to_local_conn(queue, remote_points)
+
+            err = la.norm((true_local_points - local_points).get(), np.inf)
+            eoc_rec[i_local_part, i_remote_part].add_data_point(1./n, err)
+
+    for (i, j), e in eoc_rec.items():
+        if e is not None:
+            print("Error of connection from part %i to part %i." % (i, j))
+            print(e)
+            assert(e.order_estimate() >= order - 0.5 or e.max_error() < 1e-11)
+
+# }}}
+
+
+# {{{ partition_mesh
+
+@pytest.mark.parametrize("dim", [2, 3])
+@pytest.mark.parametrize("num_parts", [4, 5, 7])
+@pytest.mark.parametrize("num_meshes", [1, 2, 7])
+@pytest.mark.parametrize("scramble_partitions", [True, False])
+def test_partition_mesh(num_parts, num_meshes, dim, scramble_partitions):
+    np.random.seed(42)
+    n = (5,) * dim
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+    meshes = [generate_regular_rect_mesh(a=(0 + i,) * dim, b=(1 + i,) * dim, n=n)
+                        for i in range(num_meshes)]
+
+    from meshmode.mesh.processing import merge_disjoint_meshes
+    mesh = merge_disjoint_meshes(meshes)
+
+    if scramble_partitions:
+        part_per_element = np.random.randint(num_parts, size=mesh.nelements)
+    else:
+        pymetis = pytest.importorskip('pymetis')
+
+        _, p = pymetis.part_graph(num_parts,
+                          xadj=mesh.nodal_adjacency.neighbors_starts.tolist(),
+                          adjncy=mesh.nodal_adjacency.neighbors.tolist())
+        part_per_element = np.array(p)
+
+    from meshmode.mesh.processing import partition_mesh
+    # TODO: The same part_per_element array must be used to partition each mesh.
+    # Maybe the interface should be changed to guarantee this.
+    new_meshes = [
+        partition_mesh(mesh, part_per_element, i) for i in range(num_parts)]
+
+    assert mesh.nelements == np.sum(
+        [new_meshes[i][0].nelements for i in range(num_parts)]), \
+        "part_mesh has the wrong number of elements"
+
+    assert count_tags(mesh, BTAG_ALL) == np.sum(
+        [count_tags(new_meshes[i][0], BTAG_ALL) for i in range(num_parts)]), \
+        "part_mesh has the wrong number of BTAG_ALL boundaries"
+
+    from meshmode.mesh import BTAG_PARTITION, InterPartitionAdjacencyGroup
+    from meshmode.mesh.processing import find_group_indices
+    num_tags = np.zeros((num_parts,))
+
+    index_lookup_table = dict()
+    for ipart, (m, _) in enumerate(new_meshes):
+        for igrp in range(len(m.groups)):
+            adj = m.facial_adjacency_groups[igrp][None]
+            if not isinstance(adj, InterPartitionAdjacencyGroup):
+                # This group is not connected to another partition.
+                continue
+            for i, (elem, face) in enumerate(zip(adj.elements, adj.element_faces)):
+                index_lookup_table[ipart, igrp, elem, face] = i
+
+    for part_num in range(num_parts):
+        part, part_to_global = new_meshes[part_num]
+        for grp_num in range(len(part.groups)):
+            adj = part.facial_adjacency_groups[grp_num][None]
+            tags = -part.facial_adjacency_groups[grp_num][None].neighbors
+            assert np.all(tags >= 0)
+            if not isinstance(adj, InterPartitionAdjacencyGroup):
+                # This group is not connected to another partition.
+                continue
+            elem_base = part.groups[grp_num].element_nr_base
+            for idx in range(len(adj.elements)):
+                if adj.global_neighbors[idx] == -1:
+                    continue
+                elem = adj.elements[idx]
+                face = adj.element_faces[idx]
+                n_part_num = adj.neighbor_partitions[idx]
+                n_meshwide_elem = adj.global_neighbors[idx]
+                n_face = adj.neighbor_faces[idx]
+                num_tags[n_part_num] += 1
+                n_part, n_part_to_global = new_meshes[n_part_num]
+                # Hack: find_igrps expects a numpy.ndarray and returns
+                #       a numpy.ndarray. But if a single integer is fed
+                #       into find_igrps, an integer is returned.
+                n_grp_num = int(find_group_indices(n_part.groups, n_meshwide_elem))
+                n_adj = n_part.facial_adjacency_groups[n_grp_num][None]
+                n_elem_base = n_part.groups[n_grp_num].element_nr_base
+                n_elem = n_meshwide_elem - n_elem_base
+                n_idx = index_lookup_table[n_part_num, n_grp_num, n_elem, n_face]
+                assert (part_num == n_adj.neighbor_partitions[n_idx]
+                        and elem + elem_base == n_adj.global_neighbors[n_idx]
+                        and face == n_adj.neighbor_faces[n_idx]),\
+                        "InterPartitionAdjacencyGroup is not consistent"
+                _, n_part_to_global = new_meshes[n_part_num]
+                p_meshwide_elem = part_to_global[elem + elem_base]
+                p_meshwide_n_elem = n_part_to_global[n_elem + n_elem_base]
+
+                p_grp_num = find_group_indices(mesh.groups, p_meshwide_elem)
+                p_n_grp_num = find_group_indices(mesh.groups, p_meshwide_n_elem)
+
+                p_elem_base = mesh.groups[p_grp_num].element_nr_base
+                p_n_elem_base = mesh.groups[p_n_grp_num].element_nr_base
+                p_elem = p_meshwide_elem - p_elem_base
+                p_n_elem = p_meshwide_n_elem - p_n_elem_base
+
+                f_groups = mesh.facial_adjacency_groups[p_grp_num]
+                for p_bnd_adj in f_groups.values():
+                    for idx in range(len(p_bnd_adj.elements)):
+                        if (p_elem == p_bnd_adj.elements[idx] and
+                                 face == p_bnd_adj.element_faces[idx]):
+                            assert p_n_elem == p_bnd_adj.neighbors[idx],\
+                                    "Tag does not give correct neighbor"
+                            assert n_face == p_bnd_adj.neighbor_faces[idx],\
+                                    "Tag does not give correct neighbor"
+
+    for i_tag in range(num_parts):
+        tag_sum = 0
+        for mesh, _ in new_meshes:
+            tag_sum += count_tags(mesh, BTAG_PARTITION(i_tag))
+        assert num_tags[i_tag] == tag_sum,\
+                "part_mesh has the wrong number of BTAG_PARTITION boundaries"
+
+
+def count_tags(mesh, tag):
+    num_bnds = 0
+    for adj_dict in mesh.facial_adjacency_groups:
+        for neighbors in adj_dict[None].neighbors:
+            if neighbors < 0:
+                if -neighbors & mesh.boundary_tag_bit(tag) != 0:
+                    num_bnds += 1
+    return num_bnds
+
+# }}}
+
+
+# {{{ MPI test boundary swap
+
+def _test_mpi_boundary_swap(dim, order, num_groups):
+    from meshmode.distributed import MPIMeshDistributor, MPIBoundaryCommSetupHelper
+
+    from mpi4py import MPI
+    mpi_comm = MPI.COMM_WORLD
+    i_local_part = mpi_comm.Get_rank()
+    num_parts = mpi_comm.Get_size()
+
+    mesh_dist = MPIMeshDistributor(mpi_comm)
+
+    if mesh_dist.is_mananger_rank():
+        np.random.seed(42)
+        from meshmode.mesh.generation import generate_warped_rect_mesh
+        meshes = [generate_warped_rect_mesh(dim, order=order, n=4)
+                        for _ in range(num_groups)]
+
+        if num_groups > 1:
+            from meshmode.mesh.processing import merge_disjoint_meshes
+            mesh = merge_disjoint_meshes(meshes)
+        else:
+            mesh = meshes[0]
+
+        part_per_element = np.random.randint(num_parts, size=mesh.nelements)
+
+        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)
+    else:
+        local_mesh = mesh_dist.receive_mesh_part()
+
+    group_factory = PolynomialWarpAndBlendGroupFactory(order)
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    from meshmode.discretization import Discretization
+    vol_discr = Discretization(cl_ctx, local_mesh, group_factory)
+
+    from meshmode.distributed import get_connected_partitions
+    connected_parts = get_connected_partitions(local_mesh)
+    assert i_local_part not in connected_parts
+    bdry_setup_helpers = {}
+    local_bdry_conns = {}
+
+    from meshmode.discretization.connection import make_face_restriction
+    from meshmode.mesh import BTAG_PARTITION
+    for i_remote_part in connected_parts:
+        local_bdry_conns[i_remote_part] = make_face_restriction(
+                vol_discr, group_factory, BTAG_PARTITION(i_remote_part))
+
+        setup_helper = bdry_setup_helpers[i_remote_part] = \
+                MPIBoundaryCommSetupHelper(
+                        mpi_comm, queue, local_bdry_conns[i_remote_part],
+                        i_remote_part, bdry_grp_factory=group_factory)
+
+        setup_helper.post_sends()
+
+    remote_to_local_bdry_conns = {}
+    from meshmode.discretization.connection import check_connection
+    while bdry_setup_helpers:
+        for i_remote_part, setup_helper in bdry_setup_helpers.items():
+            if setup_helper.is_setup_ready():
+                assert bdry_setup_helpers.pop(i_remote_part) is setup_helper
+                conn = setup_helper.complete_setup()
+                check_connection(conn)
+                remote_to_local_bdry_conns[i_remote_part] = conn
+                break
+
+        # FIXME: Not ideal, busy-waits
+
+    _test_data_transfer(mpi_comm,
+                        queue,
+                        local_bdry_conns,
+                        remote_to_local_bdry_conns,
+                        connected_parts)
+
+    logger.debug("Rank %d exiting", i_local_part)
+
+
+# TODO
+def _test_data_transfer(mpi_comm, queue, local_bdry_conns,
+                        remote_to_local_bdry_conns, connected_parts):
+    from mpi4py import MPI
+
+    def f(x):
+        return 10*cl.clmath.sin(20.*x)
+
+    '''
+    Here is a simplified example of what happens from
+    the point of view of the local rank.
+
+    Local rank:
+        1. Transfer local points from local boundary to remote boundary
+            to get remote points.
+        2. Send remote points to remote rank.
+    Remote rank:
+        3. Receive remote points from local rank.
+        4. Transfer remote points from remote boundary to local boundary
+            to get local points.
+        5. Send local points to local rank.
+    Local rank:
+        6. Receive local points from remote rank.
+        7. Check if local points are the same as the original local points.
+    '''
+
+    # 1.
+    send_reqs = []
+    for i_remote_part in connected_parts:
+        conn = remote_to_local_bdry_conns[i_remote_part]
+        bdry_discr = local_bdry_conns[i_remote_part].to_discr
+        bdry_x = bdry_discr.nodes()[0].with_queue(queue=queue)
+
+        true_local_f = f(bdry_x)
+        remote_f = conn(queue, true_local_f)
+
+        # 2.
+        send_reqs.append(mpi_comm.isend(remote_f.get(queue=queue),
+                                        dest=i_remote_part,
+                                        tag=TAG_SEND_REMOTE_NODES))
+
+    # 3.
+    buffers = {}
+    for i_remote_part in connected_parts:
+        status = MPI.Status()
+        mpi_comm.probe(source=i_remote_part,
+                       tag=TAG_SEND_REMOTE_NODES,
+                       status=status)
+        buffers[i_remote_part] = np.empty(status.count, dtype=bytes)
+
+    recv_reqs = {}
+    for i_remote_part, buf in buffers.items():
+        recv_reqs[i_remote_part] = mpi_comm.irecv(buf=buf,
+                                                  source=i_remote_part,
+                                                  tag=TAG_SEND_REMOTE_NODES)
+    remote_to_local_f_data = {}
+    for i_remote_part, req in recv_reqs.items():
+        remote_to_local_f_data[i_remote_part] = req.wait()
+        buffers[i_remote_part] = None   # free buffer
+
+    for req in send_reqs:
+        req.wait()
+
+    # 4.
+    send_reqs = []
+    for i_remote_part in connected_parts:
+        conn = remote_to_local_bdry_conns[i_remote_part]
+        local_f_np = remote_to_local_f_data[i_remote_part]
+        local_f_cl = cl.array.Array(queue,
+                                    shape=local_f_np.shape,
+                                    dtype=local_f_np.dtype)
+        local_f_cl.set(local_f_np)
+        remote_f = conn(queue, local_f_cl).get(queue=queue)
+
+        # 5.
+        send_reqs.append(mpi_comm.isend(remote_f,
+                                        dest=i_remote_part,
+                                        tag=TAG_SEND_LOCAL_NODES))
+
+    # 6.
+    buffers = {}
+    for i_remote_part in connected_parts:
+        status = MPI.Status()
+        mpi_comm.probe(source=i_remote_part,
+                       tag=TAG_SEND_LOCAL_NODES,
+                       status=status)
+        buffers[i_remote_part] = np.empty(status.count, dtype=bytes)
+
+    recv_reqs = {}
+    for i_remote_part, buf in buffers.items():
+        recv_reqs[i_remote_part] = mpi_comm.irecv(buf=buf,
+                                                  source=i_remote_part,
+                                                  tag=TAG_SEND_LOCAL_NODES)
+    local_f_data = {}
+    for i_remote_part, req in recv_reqs.items():
+        local_f_data[i_remote_part] = req.wait()
+        buffers[i_remote_part] = None   # free buffer
+
+    for req in send_reqs:
+        req.wait()
+
+    # 7.
+    for i_remote_part in connected_parts:
+        bdry_discr = local_bdry_conns[i_remote_part].to_discr
+        bdry_x = bdry_discr.nodes()[0].with_queue(queue=queue)
+
+        true_local_f = f(bdry_x).get(queue=queue)
+        local_f = local_f_data[i_remote_part]
+
+        from numpy.linalg import norm
+        err = norm(true_local_f - local_f, np.inf)
+        assert err < 1e-11, "Error = %f is too large" % err
+
+# }}}
+
+
+# {{{ MPI pytest entrypoint
+
+@pytest.mark.mpi
+@pytest.mark.parametrize("num_partitions", [3, 4])
+@pytest.mark.parametrize("order", [2, 3])
+def test_mpi_communication(num_partitions, order):
+    pytest.importorskip("mpi4py")
+
+    num_ranks = num_partitions
+    from subprocess import check_call
+    import sys
+    newenv = os.environ.copy()
+    newenv["RUN_WITHIN_MPI"] = "1"
+    newenv["order"] = str(order)
+    check_call([
+        "mpiexec", "-np", str(num_ranks), "-x", "RUN_WITHIN_MPI",
+        sys.executable, __file__],
+        env=newenv)
+
+# }}}
+
+
+if __name__ == "__main__":
+    if "RUN_WITHIN_MPI" in os.environ:
+        dim = 2
+        order = int(os.environ["order"])
+        num_groups = 2
+        _test_mpi_boundary_swap(dim, order, num_groups)
+    else:
+        import sys
+        if len(sys.argv) > 1:
+            exec(sys.argv[1])
+        else:
+            from py.test.cmdline import main
+            main([__file__])
+
+# vim: fdm=marker