diff --git a/examples/acoustics3d.lpy b/examples/acoustics3d.lpy
new file mode 100644
index 0000000000000000000000000000000000000000..919566deeaf65b7ee3e13eaac38bfaaf9c75ab2c
--- /dev/null
+++ b/examples/acoustics3d.lpy
@@ -0,0 +1,86 @@
+% ------------------------------------------------------
+% VOLUME KERNEL
+% input: u,v,w,p are float, Np x K 
+% input: DrDsDt is float4, Np x Np 
+% input: dRdx, dRdy, dRdz are float4, Np x K
+% output: rhsu, rhsv, rhsw, rhsp, Np x K 
+
+% loop domain: 1<= n,m <= Np, 1<= e <= K
+
+% BODY >>> 
+
+% reduction on m (assume DrDsDt is float4 for convenience)
+dudR = DrDsDt(n,m)*u(m,e)
+dvdR = DrDsDt(n,m)*v(m,e)
+dwdR = DrDsDt(n,m)*w(m,e)
+dpdR = DrDsDt(n,m)*p(m,e)
+
+% volume flux 
+rhsu(n,e) = dot4(dRdx(k),dpdR)
+rhsv(n,e) = dot4(dRdy(k),dpdR)
+rhsw(n,e) = dot4(dRdz(k),dpdR)
+rhsp(n,e) = dot4(dRdx(k), dudR) + dot4(dRdy(k), dvdR) + dot4(dRdz(k), dwdR)
+
+% BODY <<<
+% ------------------------------------------------------
+
+% ------------------------------------------------------
+% SURFACE KERNEL
+
+% input: u,v,w,p are float, Np x K 
+% input: LIFT is float, Np x (Nfp*Nfaces) 
+% input: nx,ny,nz,Fscale are float, (Nfp*Nfaces) x K
+% input/output: rhsu, rhsv, rhsw, rhsp are float Np x K
+
+% loop-domain 1<= m <= Nfp*Nfaces, 1<= n <= Np, 1<= e <= K
+
+% BODY >>> 
+% find surface node indices at both traces
+idP = vmapP(m,e)
+idM = vmapM(m,e)
+
+% can we bounce to single index (row/column major is important)
+% can we use this indexing here for clarity ?
+du = u(idP)-u(idM)
+dv = v(idP)-v(idM)
+dw = w(idP)-w(idM)
+dp = bc(idM)*p(idP) - p(idM)
+
+dQ = 0.5*Fscale(m,e)*(dp - nx(m,e)*du - ny(m,e)*dv - nz(m,e)*dw)
+
+fluxu = -nx(m,e)*dQ
+fluxv = -ny(m,e)*dQ
+fluxw = -nz(m,e)*dQ
+fluxp =          dQ
+
+% reduction here
+rhsu(n,e) += LIFT(n,m)*fluxu
+rhsv(n,e) += LIFT(n,m)*fluxv
+rhsw(n,e) += LIFT(n,m)*fluxw
+rhsp(n,e) += LIFT(n,m)*fluxp
+% BODY <<<
+% ------------------------------------------------------
+
+% ------------------------------------------------------
+% RK kernel here
+
+% input/output: u,v,w,p are float, Np*K 
+% input/output: resu,resv,resw,resp are float, Np*K 
+% input parameters rk4a, rk4b, dt
+
+% 1 <= n <= K*Np, 
+
+% BODY >>> 
+
+resu(n) = rk4a*resu(n) + dt*rhsu
+resv(n) = rk4a*resv(n) + dt*rhsv
+resw(n) = rk4a*resw(n) + dt*rhsw
+resp(n) = rk4a*resp(n) + dt*rhsp
+
+u(n) += rk4b*resu(n)
+v(n) += rk4b*resv(n)
+w(n) += rk4b*resw(n)
+p(n) += rk4b*resp(n)
+
+% BODY <<<
+% ------------------------------------------------------
\ No newline at end of file
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 1eebc5952a7ec15ef5f625011664bc77a764a946..187c17050ee482ac2f0eeeac2f809a0d15f7d9bf 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -42,16 +42,26 @@ def _arg_matches_spec(arg, val, other_args):
 class CompiledKernel:
     def __init__(self, context, kernel, size_args=None, options=[],
              edit_code=False, codegen_kwargs={}):
+        """
+        :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
+          (a warning will be issued if more than one is returned). If the kernel
+          has not yet been loop-scheduled, that is done, too, with no specific
+          arguments.
+        """
         import loopy as lp
 
         # {{{ do scheduling, if not yet done
 
         needs_check = False
 
-        if kernel.schedule is None:
+        if not isinstance(kernel, lp.LoopKernel) or kernel.schedule is None:
+            if isinstance(kernel, lp.LoopKernel):
+                # kernel-iterable, really
+                kernel = lp.generate_loop_schedules(kernel)
+
             kernel_count = 0
 
-            for scheduled_kernel in lp.generate_loop_schedules(kernel):
+            for scheduled_kernel in kernel:
                 kernel_count += 1
 
                 if kernel_count == 1: