diff --git a/meshmode/discretization/connection/opposite_face.py b/meshmode/discretization/connection/opposite_face.py
index 3546c963f345b11e2b56ba309627ec65f8b1cfae..4558730c16c4bdde40e2b83a853bdad7a9d8dcaa 100644
--- a/meshmode/discretization/connection/opposite_face.py
+++ b/meshmode/discretization/connection/opposite_face.py
@@ -393,8 +393,153 @@ def make_opposite_face_connection(volume_to_bdry_conn):
 # }}}
 
 
-def _make_cross_partition_batches():
-    return [42]
+def _make_cross_partition_batch(queue, vol_to_bdry_conns, adj, i_src_part, i_src_grp, from_elem, from_face):
+
+    (i_tgt_part, i_tgt_grp, bdry_elem, bdry_face) = adj.get_neighbor(from_elem, from_face)
+
+    src_bdry_discr = vol_to_bdry_conns[i_tgt_part].to_discr
+    tgt_bdry_discr = vol_to_bdry_conns[i_src_part].to_discr
+
+    to_bdry_nodes = (
+            # FIXME: This should view-then-transfer (but PyOpenCL doesn't do
+            # non-contiguous transfers for now).
+            tgt_bdry_discr.groups[i_tgt_grp].view(
+                tgt_bdry_discr.nodes().get(queue=queue))
+            [:, to_bdry_element_indices])
+
+    tol = 1e4 * np.finfo(to_bdry_nodes.dtype).eps
+
+    # TODO: Should this use vol_discr?
+    from_mesh_grp = src_bdry_discr.mesh.groups[i_src_grp]
+    from_grp = src_bdry_discr.groups[i_src_grp]
+
+    dim = from_grp.dim
+    ambient_dim, nelements, nto_unit_nodes = to_bdry_nodes.shape
+
+    initial_guess = np.mean(from_mesh_grp.vertex_unit_coordinates(), axis=0)
+
+    from_unit_nodes = np.empty((dim, nelements, nto_unit_nodes))
+    from_unit_nodes[:] = initial_guess.reshape(-1, 1, 1)
+
+    import modepy as mp
+    from_vdm = mp.vandermonde(from_grp.basis(), from_grp.unit_nodes)
+    from_inv_t_vdm = la.inv(from_vdm.T)
+    from_nfuncs = len(from_grp.basis())
+
+    # (ambient_dim, nelements, nfrom_unit_nodes)
+    from_bdry_nodes = (
+            # FIXME: This should view-then-transfer (but PyOpenCL doesn't do
+            # non-contiguous transfers for now).
+            # TODO: Should this be vol_discr?
+            src_bdry_discr.groups[i_src_grp].view(
+                src_bdry_discr.nodes().get(queue=queue))
+            [:, from_bdry_element_indices])
+
+    def apply_map(unit_nodes):
+        # unit_nodes: (dim, nelements, nto_unit_nodes)
+        # basis_at_unit_nodes
+        basis_at_unit_nodes = np.empty((from_nfuncs, nelements, nto_unit_nodes))
+        for i, f in enumerate(from_grp.basis()):
+            basis_at_unit_nodes[i] = (
+                    f(unit_nodes.reshape(dim, -1))
+                    .reshape(nelements, nto_unit_nodes))
+        intp_coeffs = np.einsum("fj,jet->fet", from_inv_t_vdm, basis_at_unit_nodes)
+        # If we're interpolating 1, we had better get 1 back.
+        one_deviation = np.abs(np.sum(intp_coeffs, axis=0) - 1)
+        assert (one_deviation < tol).all(), np.max(one_deviation)
+        return np.einsum("fet,aef->aet", intp_coeffs, from_bdry_nodes)
+
+    def get_map_jacobian(unit_nodes):
+        # unit_nodes: (dim, nelements, nto_unit_nodes)
+        # basis_at_unit_nodes
+        dbasis_at_unit_nodes = np.empty(
+                (dim, from_nfuncs, nelements, nto_unit_nodes))
+        for i, df in enumerate(from_grp.grad_basis()):
+            df_result = df(unit_nodes.reshape(dim, -1))
+            for rst_axis, df_r in enumerate(df_result):
+                dbasis_at_unit_nodes[rst_axis, i] = (
+                        df_r.reshape(nelements, nto_unit_nodes))
+        dintp_coeffs = np.einsum(
+                "fj,rjet->rfet", from_inv_t_vdm, dbasis_at_unit_nodes)
+        return np.einsum("rfet,aef->raet", dintp_coeffs, from_bdry_nodes)
+
+    logger.info("make_opposite_face_connection: begin gauss-newton")
+    niter = 0
+    while True:
+        resid = apply_map(from_unit_nodes) - to_bdry_nodes
+        df = get_map_jacobian(from_unit_nodes)
+        df_inv_resid = np.empty_like(from_unit_nodes)
+        # For the 1D/2D accelerated versions, we'll use the normal
+        # equations and Cramer's rule. If you're looking for high-end
+        # numerics, look no further than meshmode.
+        if dim == 1:
+            # A is df.T
+            ata = np.einsum("iket,jket->ijet", df, df)
+            atb = np.einsum("iket,ket->iet", df, resid)
+            df_inv_resid = atb / ata[0, 0]
+        elif dim == 2:
+            # A is df.T
+            ata = np.einsum("iket,jket->ijet", df, df)
+            atb = np.einsum("iket,ket->iet", df, resid)
+            det = ata[0, 0]*ata[1, 1] - ata[0, 1]*ata[1, 0]
+            df_inv_resid = np.empty_like(from_unit_nodes)
+            df_inv_resid[0] = 1/det * (ata[1, 1] * atb[0] - ata[1, 0]*atb[1])
+            df_inv_resid[1] = 1/det * (-ata[0, 1] * atb[0] + ata[0, 0]*atb[1])
+        else:
+            # The boundary of a 3D mesh is 2D, so that's the
+            # highest-dimensional case we genuinely care about.
+            #
+            # This stinks, performance-wise, because it's not vectorized.
+            # But we'll only hit it for boundaries of 4+D meshes, in which
+            # case... good luck. :)
+            for e in range(nelements):
+                for t in range(nto_unit_nodes):
+                    df_inv_resid[:, e, t], _, _, _ = \
+                            la.lstsq(df[:, :, e, t].T, resid[:, e, t])
+        from_unit_nodes = from_unit_nodes - df_inv_resid
+        max_resid = np.max(np.abs(resid))
+        logger.debug("gauss-newton residual: %g" % max_resid)
+        if max_resid < tol:
+            logger.info("make_opposite_face_connection: gauss-newton: done, "
+                    "final residual: %g" % max_resid)
+            break
+        niter += 1
+        if niter > 10:
+            raise RuntimeError("Gauss-Newton (for finding opposite-face reference "
+                    "coordinates) did not converge")
+
+    def to_dev(ary):
+        return cl.array.to_device(queue, ary, array_queue=None)
+
+    done_elements = np.zeros(nelements, dtype=bool)
+
+    # TODO: Still need to figure out what's happening here.
+    while True:
+        todo_elements, = np.where(~done_elements)
+        if not len(todo_elements):
+            return
+        template_unit_nodes = from_unit_nodes[:, todo_elements[0], :]
+        unit_node_dist = np.max(np.max(np.abs(
+                from_unit_nodes[:, todo_elements, :]
+                -
+                template_unit_nodes.reshape(dim, 1, -1)),
+                axis=2), axis=0)
+        close_els = todo_elements[unit_node_dist < tol]
+        done_elements[close_els] = True
+        unit_node_dist = np.max(np.max(np.abs(
+                from_unit_nodes[:, todo_elements, :]
+                -
+                template_unit_nodes.reshape(dim, 1, -1)),
+                axis=2), axis=0)
+
+        from meshmode.discretization.connection import InterpolationBatch
+        yield InterpolationBatch(
+                from_group_index=i_src_grp,
+                from_element_indices=to_dev(from_bdry_element_indices[close_els]),
+                to_element_indices=to_dev(to_bdry_element_indices[close_els]),
+                result_unit_nodes=template_unit_nodes,
+                to_element_face=None)
+
 
 def make_partition_connection(vol_to_bdry_conns):
     """
@@ -418,25 +563,28 @@ def make_partition_connection(vol_to_bdry_conns):
     with cl.CommandQueue(cl_context) as queue:
         # Create a list of batches. Each batch contains interpolation
         #   data from one partition to another.
-        for src_part_idx in range(nparts):
-            src_vol_conn = vol_to_bdry_conns[src_part_idx]
-            src_from_discr = src_vol_conn.from_discr
-            src_to_discr = src_vol_conn.to_discr
+        for i_src_part, src_vol_conn in enumerate(vol_to_bdry_conns):
-            src_mesh = src_from_discr.mesh
+            src_mesh = src_vol_conn.from_discr.mesh
             ngroups = len(src_mesh.groups)
             part_batches = [[] for _ in range(ngroups)]
             for group_num, adj in enumerate(src_mesh.interpart_adj_groups):
                 for elem_idx, elem in enumerate(adj.elements):
                     face = adj.element_faces[elem_idx]
-                    (part_idx, group_num, n_elem, n_face) =\
-                                        adj.get_neighbor(elem, face)
 
-                    # We need to create batches using the 
+                    # We need to create a batch using the
                     # neighboring face, element, and group
                     # I'm not sure how I would do this.
                     # My guess is that it would look
                     # something like _make_cross_face_batches
-                    part_batches[group_num].extend(_make_cross_partition_batches())
+                    part_batches[group_num].extend(
+                            _make_cross_partition_batch(
+                                queue,
+                                vol_to_bdry_conns,
+                                adj,
+                                i_src_part,
+                                group_num,
+                                elem,
+                                face))
 
             # Make one Discr connection for each partition.
             disc_conns.append(DirectDiscretizationConnection(