diff --git a/sumpy/__init__.py b/sumpy/__init__.py
index bb1c7069487881b5ba1836bb4c4759080a8a8640..04bdb31823c9da64b15ded4beffb3d761a2de598 100644
--- a/sumpy/__init__.py
+++ b/sumpy/__init__.py
@@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import os
 from sumpy.p2p import P2P, P2PFromCSR
 from sumpy.p2e import P2EFromSingleBox, P2EFromCSR
 from sumpy.e2p import E2PFromSingleBox, E2PFromCSR
@@ -36,13 +37,18 @@ __all__ = [
     "E2EFromCSR", "E2EFromChildren", "E2EFromParent"]
 
 
-code_cache = PersistentDict("sumpy-code-cache-v4-"+VERSION_TEXT)
+code_cache = PersistentDict("sumpy-code-cache-v5-"+VERSION_TEXT)
 
 
 # {{{ cache control
 
 CACHING_ENABLED = True
 
+CACHING_ENABLED = (
+    "SUMPY_NO_CACHE" not in os.environ
+    and
+    "CG_NO_CACHE" not in os.environ)
+
 
 def set_caching_enabled(flag):
     """Set whether :mod:`loopy` is allowed to use disk caching for its various
@@ -53,7 +59,7 @@ def set_caching_enabled(flag):
 
 
 class CacheMode(object):
-    """A context manager for setting whether :mod:`loopy` is allowed to use
+    """A context manager for setting whether :mod:`sumpy` is allowed to use
     disk caches.
     """
 
diff --git a/sumpy/codegen.py b/sumpy/codegen.py
index f23822828409eba0e007094e777bf4bc36f2837a..ac746fcc091084f0ea1d324729d2cac9a466284a 100644
--- a/sumpy/codegen.py
+++ b/sumpy/codegen.py
@@ -71,6 +71,26 @@ BESSEL_PREAMBLE = """//CL//
 #include <pyopencl-bessel-j.cl>
 #include <pyopencl-bessel-y.cl>
 #include <pyopencl-bessel-j-complex.cl>
+
+typedef struct bessel_j_two_result_str
+{
+    cdouble_t jv, jvp1;
+} bessel_j_two_result;
+
+bessel_j_two_result bessel_jv_two(int v, double z)
+{
+    bessel_j_two_result result;
+    result.jv = cdouble_fromreal(bessel_jv(v, z));
+    result.jvp1 = cdouble_fromreal(bessel_jv(v+1, z));
+    return result;
+}
+
+bessel_j_two_result bessel_jv_two_complex(int v, cdouble_t z)
+{
+    bessel_j_two_result result;
+    bessel_j_complex(v, z, &result.jv, &result.jvp1);
+    return result;
+}
 """
 
 HANKEL_PREAMBLE = """//CL//
@@ -81,11 +101,11 @@ typedef struct hank1_01_result_str
     cdouble_t order0, order1;
 } hank1_01_result;
 
-hank1_01_result hank1_01(cdouble_t z)
+hank1_01_result hank1_01(double z)
 {
     hank1_01_result result;
-    result.order0 = cdouble_new(bessel_j0(z.x), bessel_y0(z.x));
-    result.order1 = cdouble_new(bessel_j1(z.x), bessel_y1(z.x));
+    result.order0 = cdouble_new(bessel_j0(z), bessel_y0(z));
+    result.order1 = cdouble_new(bessel_j1(z), bessel_y1(z));
     return result;
 }
 
@@ -107,9 +127,11 @@ def bessel_preamble_generator(target, seen_dtypes, seen_functions):
     if any(func.name == "hank1_01" for func in seen_functions):
         yield ("50-sumpy-hankel", HANKEL_PREAMBLE)
         require_bessel = True
-    if require_bessel or any(func.name == "bessel_jv" for func in seen_functions):
+    if (require_bessel
+            or any(func.name == "bessel_jv_two" for func in seen_functions)):
         yield ("40-sumpy-bessel", BESSEL_PREAMBLE)
 
+
 hank1_01_result_dtype = cl.tools.get_or_register_dtype("hank1_01_result",
         np.dtype([
             ("order0", np.complex128),
@@ -117,6 +139,13 @@ hank1_01_result_dtype = cl.tools.get_or_register_dtype("hank1_01_result",
             ]),
         )
 
+bessel_j_two_result_dtype = cl.tools.get_or_register_dtype("bessel_j_two_result",
+        np.dtype([
+            ("jv", np.complex128),
+            ("jvp1", np.complex128),
+            ]),
+        )
+
 
 def bessel_mangler(target, identifier, arg_dtypes):
     """A function "mangler" to make Bessel functions
@@ -132,11 +161,21 @@ def bessel_mangler(target, identifier, arg_dtypes):
     if identifier == "hank1_01":
         if arg_dtypes[0].kind == "c":
             identifier = "hank1_01_complex"
+            return (np.dtype(hank1_01_result_dtype),
+                    identifier, (np.dtype(np.complex128),))
+        else:
+            return (np.dtype(hank1_01_result_dtype),
+                    identifier, (np.dtype(np.float64),))
+
+    elif identifier == "bessel_jv_two":
+        if arg_dtypes[1].kind == "c":
+            identifier = "bessel_jv_two_complex"
+            return (np.dtype(bessel_j_two_result_dtype),
+                    identifier, (np.dtype(np.int32), np.dtype(np.complex128),))
+        else:
+            return (np.dtype(bessel_j_two_result_dtype),
+                    identifier, (np.dtype(np.int32), np.dtype(np.float64),))
 
-        return (np.dtype(hank1_01_result_dtype),
-                identifier, (np.dtype(np.complex128),))
-    elif identifier == "bessel_jv":
-        return np.dtype(np.float64), identifier
     else:
         return None
 
@@ -150,8 +189,8 @@ class BesselGetter(object):
         return prim.Variable("hank1_01")(arg)
 
     @memoize_method
-    def bessel_j_impl(self, order, arg):
-        return prim.Variable("bessel_jv")(order, arg)
+    def bessel_jv_two(self, order, arg):
+        return prim.Variable("bessel_jv_two")(order, arg)
 
     @memoize_method
     def hankel_1(self, order, arg):
@@ -183,14 +222,14 @@ class BesselGetter(object):
     def bessel_j(self, order, arg):
         top_order = self.bessel_j_arg_to_top_order[arg]
 
+        bessel_two = (
+                prim.CommonSubexpression(self.bessel_jv_two(top_order-1, arg),
+                    "bessel_jv_two_result"))
+
         if order == top_order:
-            return prim.CommonSubexpression(
-                    self.bessel_j_impl(order, arg),
-                    "bessel_j_%d" % order)
+            return prim.Lookup(bessel_two, "jvp1")
         elif order == top_order-1:
-            return prim.CommonSubexpression(
-                    self.bessel_j_impl(order, arg),
-                    "bessel_j_%d" % order)
+            return prim.Lookup(bessel_two, "jv")
         elif order < 0:
             return (-1)**order*self.bessel_j(-order, arg)
         else:
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 9ea1ef2ac65c668a3346f9ab73963a8d59a2fec9..3138ce19ed403bf02894226dc7baecbde63d7f8e 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -168,16 +168,17 @@ class LayerPotentialBase(KernelComputation):
 
         import pyopencl as cl
         dev = self.context.devices[0]
-        if dev.type & cl.device_type.CPU:
-            loopy_knl = lp.split_iname(loopy_knl, "itgt", 16, outer_tag="g.0",
-                    inner_tag="l.0")
-            loopy_knl = lp.split_iname(loopy_knl, "isrc", 256)
-            loopy_knl = lp.set_loop_priority(loopy_knl,
-                    ["isrc_outer", "itgt_inner"])
-        else:
-            from warnings import warn
-            warn("don't know how to tune layer potential computation for '%s'" % dev)
-            loopy_knl = lp.split_iname(loopy_knl, "itgt", 128, outer_tag="g.0")
+        if 0:
+            if dev.type & cl.device_type.CPU:
+                loopy_knl = lp.split_iname(loopy_knl, "itgt", 16, outer_tag="g.0",
+                        inner_tag="l.0")
+                loopy_knl = lp.split_iname(loopy_knl, "isrc", 256)
+                loopy_knl = lp.set_loop_priority(loopy_knl,
+                        ["isrc_outer", "itgt_inner"])
+            else:
+                from warnings import warn
+                warn("don't know how to tune layer potential computation for '%s'" % dev)
+                loopy_knl = lp.split_iname(loopy_knl, "itgt", 128, outer_tag="g.0")
 
         return loopy_knl
 
@@ -250,6 +251,8 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
     def __call__(self, queue, targets, sources, centers, **kwargs):
         knl = self.get_optimized_kernel()
 
+        knl = lp.set_options(knl, cl_build_options=["-g", "-O0"])
+
         return knl(queue, src=sources, tgt=targets, center=centers,
                 **kwargs)
 
diff --git a/test/test_kernels.py b/test/test_kernels.py
index ad53759e12126f7eca109952075f6dcb18f656bd..c44324d5df2d2c9aab349a7201cee9dc3220b33c 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -88,21 +88,26 @@ def test_p2p(ctx_getter):
     assert rel_err < 1e-3
 
 
-@pytest.mark.parametrize("order", [2, 3, 4, 5])
-@pytest.mark.parametrize(("knl", "expn_class"), [
+@pytest.mark.parametrize("order", [4])
+@pytest.mark.parametrize(("base_knl", "expn_class"), [
     (LaplaceKernel(2), VolumeTaylorLocalExpansion),
     (LaplaceKernel(2), VolumeTaylorMultipoleExpansion),
 
     (HelmholtzKernel(2), VolumeTaylorMultipoleExpansion),
     (HelmholtzKernel(2), VolumeTaylorLocalExpansion),
     (HelmholtzKernel(2), H2DLocalExpansion),
-    (HelmholtzKernel(2), H2DMultipoleExpansion)
+    (HelmholtzKernel(2), H2DMultipoleExpansion),
+
+    (HelmholtzKernel(2, allow_evanescent=True), VolumeTaylorMultipoleExpansion),
+    (HelmholtzKernel(2, allow_evanescent=True), VolumeTaylorLocalExpansion),
+    (HelmholtzKernel(2, allow_evanescent=True), H2DLocalExpansion),
+    (HelmholtzKernel(2, allow_evanescent=True), H2DMultipoleExpansion),
     ])
 @pytest.mark.parametrize("with_source_derivative", [
     False,
     True
     ])
-def test_p2e2p(ctx_getter, knl, expn_class, order, with_source_derivative):
+def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
     #logging.basicConfig(level=logging.INFO)
 
     from sympy.core.cache import clear_cache
@@ -117,11 +122,16 @@ def test_p2e2p(ctx_getter, knl, expn_class, order, with_source_derivative):
     nsources = 100
 
     extra_kwargs = {}
-    if isinstance(knl, HelmholtzKernel):
-        extra_kwargs["k"] = 0.05
+    if isinstance(base_knl, HelmholtzKernel):
+        if base_knl.allow_evanescent:
+            extra_kwargs["k"] = 0.2 * (0.707 + 0.707j)
+        else:
+            extra_kwargs["k"] = 0.2
 
     if with_source_derivative:
-        knl = DirectionalSourceDerivative(knl, "dir_vec")
+        knl = DirectionalSourceDerivative(base_knl, "dir_vec")
+    else:
+        knl = base_knl
 
     out_kernels = [
             knl,
@@ -277,6 +287,14 @@ def test_p2e2p(ctx_getter, knl, expn_class, order, with_source_derivative):
         slack += 1
         grad_slack += 2
 
+    if isinstance(base_knl, HelmholtzKernel) and base_knl.allow_evanescent:
+        slack += 0.5
+        grad_slack += 0.5
+
+        if expn_class is VolumeTaylorMultipoleExpansion:
+            slack += 0.3
+            grad_slack += 0.3
+
     assert eoc_rec_pot.order_estimate() > tgt_order - slack
     assert eoc_rec_grad_x.order_estimate() > tgt_order_grad - grad_slack