diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py
index 2c70c2693355f307010e84279487175551179334..4a580725e3bceae5158d058a21ea3ee0e32d5335 100644
--- a/test/test_fem_assembly.py
+++ b/test/test_fem_assembly.py
@@ -30,7 +30,6 @@ def test_laplacian_stiffness(ctx_factory):
     from pymbolic import var
     Nc_sym = var("Nc")
 
-    print "[Nc] -> {[K,i,j,q]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d}" % dict(Nb=Nb, Nq=Nq),
     knl = lp.make_kernel(ctx.devices[0],
             "[Nc] -> {[K,i,j,q]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d}" 
             % dict(Nb=Nb, Nq=Nq),
@@ -42,11 +41,11 @@ def test_laplacian_stiffness(ctx_factory):
 
                 ],
             [
-            lp.ArrayArg("jacInv", dtype, shape=(Nc, Nq, dim, dim), order=order),
+            lp.ArrayArg("jacInv", dtype, shape=(Nc_sym, Nq, dim, dim), order=order),
             lp.ConstantArrayArg("DPsi", dtype, shape=(Nb, Nq, dim), order=order),
-            lp.ArrayArg("jacDet", dtype, shape=(Nc, Nq), order=order),
+            lp.ArrayArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
             lp.ConstantArrayArg("w", dtype, shape=(Nq, dim), order=order),
-            lp.ArrayArg("A", dtype, shape=(Nc, Nb, Nb), order=order),
+            lp.ArrayArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
             lp.ScalarArg("Nc",  np.int32, approximately=1000),
             ],
             name="semlap", assumptions="Nc>=1")
@@ -56,17 +55,16 @@ def test_laplacian_stiffness(ctx_factory):
     knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
     knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp")
     knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
-    knl = lp.add_prefetch(knl, 'jacInv', ["K_inner_outer", "K_inner_inner", "q"],
-            uni_template="jacInv[x,y,z,u]")
+    knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"],
+            uni_template="jacInv[Kii + 4*Kio +16*Ko,q,x,y]")
 
     kernel_gen = lp.generate_loop_schedules(knl,
             loop_priority=["K", "i", "j"])
-    kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))
 
-    Nc = 1000
     lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen,
             op_count=0, op_label="GFlops",
-            parameters={"Nc": 1000}, print_seq_code=True,
+            parameters={"Nc": Nc}, print_seq_code=True,
             timing_rounds=30)
 
 
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 329384071e98471202c7da7bb78e8fbdcbd060ae..07fcde54bc5771f2396e8b6c01ed5c95201977a6 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -132,8 +132,8 @@ def test_axpy(ctx_factory):
     #y = cl_array.to_device(queue, np.random.rand(n).astype(dtype))
     x = cl_random.rand(queue, n, dtype=dtype, luxury=2)
     y = cl_random.rand(queue, n, dtype=dtype, luxury=2)
-    print np.isnan(x.get()).any()
-    1/0
+    #print np.isnan(x.get()).any()
+    #1/0
     z = cl_array.zeros_like(x)
     refsol = (2*x+3*y).get()
 
@@ -161,8 +161,6 @@ def test_transpose(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
     order = "C"
-    queue = cl.CommandQueue(ctx,
-            properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     n = get_suitable_size(ctx)
 
@@ -370,7 +368,8 @@ def test_rank_one(ctx_factory):
 
     seq_knl = knl
 
-    for variant in [variant_1, variant_2, variant_4]:
+    #for variant in [variant_1, variant_2, variant_4]:
+    for variant in [variant_4]:
         kernel_gen = lp.generate_loop_schedules(variant(knl))
         kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index f74848a63fe3cb14e8b422f781af91e8453addf8..31bf59d4bd0ed70f84b98efd729e54ab9d3c78d1 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -101,15 +101,12 @@ def test_bad_stencil(ctx_factory):
                 lp.ArrayArg("a", np.float32, shape=(32,32,))
                 ])
 
-    def variant_1(knl):
-        return knl
-
     def variant_2(knl):
-        knl = lp.split_dimension(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
         knl = lp.realize_cse(knl, None, np.float32, ["i_inner", "j"])
         return knl
 
-    for variant in [variant_1, variant_2]:
+    for variant in [variant_2]:
         kernel_gen = lp.generate_loop_schedules(variant(knl),
                 loop_priority=["i_outer", "i_inner_0", "j_0"])
         kernel_gen = lp.check_kernels(kernel_gen)
diff --git a/test/test_sem.py b/test/test_sem.py
index 3e593cb0ad8f3155dcb90319ff27642df05c131e..19f854ce9b25f6fb7a2df0b3a2c849ddc1c225c8 100644
--- a/test/test_sem.py
+++ b/test/test_sem.py
@@ -54,29 +54,31 @@ def test_sem_3d(ctx_factory):
             name="semlap", assumptions="K>=1")
 
 
-    def add_pf(knl):
-        knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
-        knl = lp.add_prefetch(knl, "D", ["m", "j"])
-        knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
-        knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
-        knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
-        knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])
-
-    seq_knl = add_pf(knl)
+    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
+    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
+    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])
+
+    if 0:
+        seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
+        seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
+        seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
+    else:
+        seq_knl = knl
 
     knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
+
+    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
+    knl = lp.add_prefetch(knl, "D", ["m", "j"])
+    knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
     #knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp")
 
-    knl = add_pf(knl)
     #print seq_knl
-    #print lp.preprocess_kernel(seq_knl)
+    #print lp.preprocess_kernel(knl)
     #1/0
 
-
     knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1"))
 
-    kernel_gen = lp.generate_loop_schedules(knl,
-            loop_priority=["j_dr", "j_ds",  "i_dt"])
+    kernel_gen = lp.generate_loop_schedules(knl)
     kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
 
     K = 1000