diff --git a/examples/advection/surface.py b/examples/advection/surface.py
index 69ab3a90c336bbaf360cecfea74af4c172a6e391..2ef5d770b8cad7fb25cf12e8cfdc3bae46496e36 100644
--- a/examples/advection/surface.py
+++ b/examples/advection/surface.py
@@ -98,12 +98,12 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
     # sphere radius
     radius = 1.0
     # sphere resolution
-    resolution = 64 if dim == 2 else 1
+    resolution = 64 if dim == 2 else 1.0
 
     # cfl
     dt_factor = 1.0
     # final time
-    final_time = 1.0
+    final_time = 2.0 * np.pi
 
     # velocity field
     sym_x = sym.nodes(dim)
@@ -112,7 +112,7 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
         ])[:dim]
     norm_c = sym.sqrt((c**2).sum())
     # flux
-    flux_type = "central"
+    flux_type = "lf"
 
     # }}}
 
@@ -125,9 +125,29 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
                 np.linspace(0.0, 1.0, resolution + 1),
                 order)
     elif dim == 3:
-        from meshmode.mesh.generation import generate_icosphere
-        return generate_icosphere(radius, order=order,
-                uniform_refinement_rounds=resolution)
+        if 0:
+            from meshmode.mesh.generation import generate_icosphere
+            mesh = generate_icosphere(radius, order=4 * order)
+            from meshmode.mesh.refinement import refine_uniformly
+            mesh = refine_uniformly(mesh, resolution, with_adjacency=False)
+        else:
+            from meshmode.mesh.io import ScriptSource
+            source = ScriptSource("""
+                SetFactory("OpenCASCADE");
+                Sphere(1) = {0, 0, 0, %g};
+                Mesh.Algorithm = 6;
+                """ % radius, "geo")
+
+            from meshmode.mesh.io import generate_gmsh
+            mesh = generate_gmsh(source, 2, order=order,
+                    other_options=[
+                        "-optimize_ho",
+                        "-string", "Mesh.CharacteristicLengthMax = %g;" % resolution
+                        ],
+                    target_unit="MM")
+
+            from meshmode.mesh.processing import perform_flips
+            mesh = perform_flips(mesh, np.ones(mesh.nelements))
     else:
         raise ValueError("unsupported dimension")
 
@@ -146,10 +166,15 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
     discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order,
             quad_tag_to_group_factory=quad_tag_to_group_factory)
 
+    volume_discr = discr.discr_from_dd(sym.DD_VOLUME)
+    logger.info("nnodes:    %d", volume_discr.nnodes)
+    logger.info("nelements: %d", volume_discr.mesh.nelements)
+
     # }}}
 
     # {{{ symbolic operators
 
+
     def f_gaussian(x):
         return x[0]
 
@@ -164,6 +189,11 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
     def rhs(t, u):
         return bound_op(queue, t=t, u=u)
 
+    # check velocity is tangential
+    sym_normal = sym.surface_normal(dim, dim=dim - 1, dd=sym.DD_VOLUME).as_vector()
+    error = la.norm(bind(discr, c.dot(sym_normal))(queue).get(queue))
+    logger.info("u_dot_n:   %.5e", error)
+
     # }}}
 
     # {{{ time stepping
@@ -175,23 +205,51 @@ def main(ctx_factory, dim=2, order=4, product_tag=None, visualize=True):
     nsteps = int(final_time // dt) + 1
     dt = final_time/nsteps + 1.0e-15
 
+    logger.info("dt:        %.5e", dt)
+    logger.info("nsteps:    %d", nsteps)
+
     from grudge.shortcuts import set_up_rk4
     dt_stepper = set_up_rk4("u", dt, u, rhs)
     plot = Plotter(queue, discr, order, visualize=visualize)
 
     norm = bind(discr, sym.norm(2, sym.var("u")))
+    norm_u = 0.0
 
     step = 0
-    norm_u = 0.0
-    for event in dt_stepper.run(t_end=final_time):
+
+    event = dt_stepper.StateComputed(0.0, 0, 0, u)
+    plot(event, "fld-surface-%04d" % 0)
+
+    if visualize and dim == 3:
+        from grudge.shortcuts import make_visualizer
+        vis = make_visualizer(discr, vis_order=order)
+        vis.write_vtk_file("fld-surface-velocity.vtu", [
+            ("u", bind(discr, c)(queue)),
+            ("n", bind(discr, sym_normal)(queue))
+            ], overwrite=True)
+
+        df = sym.DOFDesc(sym.FACE_RESTR_INTERIOR)
+        face_discr = discr.connection_from_dds(sym.DD_VOLUME, df).to_discr
+
+        face_normal = bind(discr, sym.normal(
+            df, face_discr.ambient_dim, dim=face_discr.dim))(queue)
+
+        from meshmode.discretization.visualization import make_visualizer
+        vis = make_visualizer(queue, face_discr, vis_order=order,
+                use_high_order_vtk=False)
+        vis.write_vtk_file("fld-surface-face-normals.vtu", [
+            ("n", face_normal)
+            ], overwrite=True)
+
+    for event in dt_stepper.run(t_end=final_time, max_steps=nsteps + 1):
         if not isinstance(event, dt_stepper.StateComputed):
             continue
 
+        step += 1
         if step % 1 == 0:
             norm_u = norm(queue, u=event.state_component)
             plot(event, "fld-surface-%04d" % step)
 
-        step += 1
         logger.info("[%04d] t = %.5f |u| = %.5e", step, event.t, norm_u)
 
     # }}}
@@ -201,8 +259,8 @@ if __name__ == "__main__":
     import argparse
 
     parser = argparse.ArgumentParser()
-    parser.add_argument("--dim", default=2, type=int)
-    parser.add_argument("--qtag", default="none")
+    parser.add_argument("--dim", choices=[2, 3], default=2, type=int)
+    parser.add_argument("--qtag", choices=["none", "product"], default="none")
     args = parser.parse_args()
 
     logging.basicConfig(level=logging.INFO)
diff --git a/grudge/models/advection.py b/grudge/models/advection.py
index ae3d8a33fda2fcd92320f1c239fc4c4b0473c7d7..3891c9ca755fc8863334afb510f5c7a1a513eb5b 100644
--- a/grudge/models/advection.py
+++ b/grudge/models/advection.py
@@ -189,24 +189,35 @@ def v_dot_n_tpair(velocity, dd=None):
     ambient_dim = len(velocity)
     normal = sym.normal(dd, ambient_dim, dim=ambient_dim - 2)
 
-    v_dot_n_i = sym.cse(velocity.dot(normal), "v_dot_normal")
+    v_dot_n_i = sym.cse(velocity.dot(normal))
     v_dot_n_e = sym.cse(sym.OppositeInteriorFaceSwap()(v_dot_n_i))
 
-    return sym.TracePair(dd, v_dot_n_i, -v_dot_n_e)
+    return sym.TracePair(dd, v_dot_n_i, v_dot_n_e)
 
 
 def surface_advection_weak_flux(flux_type, u, velocity):
     v_dot_n = v_dot_n_tpair(velocity, dd=u.dd)
+    v_dot_n = sym.cse(0.5 * v_dot_n.jump, "v_dot_normal")
 
     flux_type = flux_type.lower()
     if flux_type == "central":
-        return u.avg * v_dot_n.avg
+        return u.avg * v_dot_n
     elif flux_type == "lf":
-        return u.avg * v_dot_n.avg + 0.5 * sym.fabs(v_dot_n) * (u.int - u.ext)
+        return u.avg * v_dot_n + 0.5 * sym.fabs(v_dot_n) * (u.int - u.ext)
     else:
         raise ValueError("flux `{}` is not implemented".format(flux_type))
 
 
+def surface_penalty_flux(u, velocity, tau=1.0):
+    if abs(tau) < 1.0e-14:
+        return 0.0
+
+    v_dot_n = v_dot_n_tpair(velocity, dd=u.dd)
+    return sym.If(sym.Comparison(v_dot_n.avg, ">", 0),
+            0.5 * tau * u.avg * v_dot_n.int,
+            0.0)
+
+
 class SurfaceAdvectionOperator(AdvectionOperatorBase):
     def __init__(self, v, flux_type="central", quad_tag=None):
         super(SurfaceAdvectionOperator, self).__init__(
@@ -217,12 +228,19 @@ class SurfaceAdvectionOperator(AdvectionOperatorBase):
         surf_v = sym.interp(sym.DD_VOLUME, u.dd)(self.v)
         return surface_advection_weak_flux(self.flux_type, u, surf_v)
 
+    def penalty(self, u):
+        surf_v = sym.interp(sym.DD_VOLUME, u.dd)(self.v)
+        return surface_penalty_flux(u, surf_v, tau=0.0)
+
     def sym_operator(self):
         u = sym.var("u")
 
         def flux(pair):
             return sym.interp(pair.dd, face_dd)(self.flux(pair))
 
+        def penalty(pair):
+            return sym.interp(pair.dd, face_dd)(self.penalty(pair))
+
         face_dd = sym.DOFDesc(sym.FACE_RESTR_ALL, self.quad_tag)
 
         quad_dd = sym.DOFDesc(sym.DTAG_VOLUME_ALL, self.quad_tag)
@@ -238,6 +256,7 @@ class SurfaceAdvectionOperator(AdvectionOperatorBase):
                     for n in range(self.ambient_dim))
                 - sym.FaceMassOperator(face_dd, sym.DD_VOLUME)(
                     flux(sym.int_tpair(u, self.quad_tag))
+                    + penalty(sym.int_tpair(u, self.quad_tag))
                     )
                 )
 
diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index ec75087e595cf58c75fc760f3d6876e3b4a656a4..ceef7a1066ceb61bebff0fdf5c96801e6e2bc0f7 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -858,6 +858,10 @@ class TracePair:
     def ext(self):
         return self.exterior
 
+    @property
+    def jump(self):
+        return self.int - self.ext
+
     @property
     def avg(self):
         return 0.5*(self.int + self.ext)