diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py
index 3bb60fcb802a7f9ba39cc78c31be7514ab5a27e9..30f01f6c5a448cc3f33a157a3207037ea2d889ff 100644
--- a/examples/matrix-ops.py
+++ b/examples/matrix-ops.py
@@ -16,12 +16,12 @@ def make_well_condition_dev_matrix(queue, n, dtype=np.float32):
 
 
 def plain_matrix_mul(ctx_factory=cl.create_some_context):
-    dtype = np.float64
+    dtype = np.float32
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    n = 16*100
+    n = 16*10
     from pymbolic import var
     a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"]
 
@@ -31,9 +31,14 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context):
         lp.LoopDimension("j", n),
         lp.LoopDimension("k", n),
         ], [
-        (c[i*n+j], a[i*n+k]*b[k*n+j])
+        (c[i, j], a[i, k]*b[k, j])
+        ],
+        [
+            lp.ArrayArg("a", dtype, shape=(n, n)),
+            lp.ArrayArg("b", dtype, shape=(n, n)),
+            lp.ArrayArg("c", dtype, shape=(n, n)),
         ],
-        default_vector_type=dtype, name="matmul")
+        name="matmul")
 
     knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
@@ -48,14 +53,14 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context):
     a = make_well_condition_dev_matrix(queue, n, dtype=dtype)
     b = make_well_condition_dev_matrix(queue, n, dtype=dtype)
     c = cl_array.empty_like(a)
-    refsol = np.dot(a.astype(np.float64).get(), b.astype(np.float64).get())
+    refsol = np.dot(a.get(), b.get())
 
     def launcher(kernel, gsize, lsize, check):
         evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data,
                 g_times_l=True)
 
         if check:
-            sol = c.astype(np.float64).get()
+            sol = c.get()
             rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro")
             assert rel_err < 1e-5, rel_err
 
@@ -205,4 +210,4 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
     else:
-        fancy_matrix_mul()
+        plain_matrix_mul()
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 43b20c47d2768867c83eaa0fcee489f3110d35d6..395c198e1d69a7b6ef907cf0072a4f2c300a2324 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -22,9 +22,7 @@ import pyopencl as cl
 # TODO:   - Tricky: Convolution, FD
 # TODO: Try, fix indirect addressing
 
-# TODO: Play with multi-d data layout (optionally?)
 # TODO: Custom reductions per red. axis
-# TODO: Switch to sympy, serve multiple accesses with one prefetch
 # TODO: Vectorize
 # TODO: Unroll
 # TODO: Parallelize reduction
@@ -112,7 +110,7 @@ def parse_tag(tag):
 # {{{ loop dim, loop domain, kernel
 
 class LoopDimension(Record):
-    __slots__ = ["name", "length", "last_cond",  "tag", 
+    __slots__ = ["name", "length", "last_cond", "tag", 
             "end_cond", "end_cond_if_last_of"]
 
     def __init__(self, name, length=None, last_cond=None, end_cond=None, tag=None, 
@@ -232,11 +230,9 @@ class ArrayArg:
                     c_contiguous_strides)
 
             if order == "F":
-                strides = _f_contiguous_strides(
-                        dtype.itemsize, shape)
+                strides = f_contiguous_strides(1, shape)
             elif order == "C":
-                strides = _c_contiguous_strides(
-                        dtype.itemsize, shape)
+                strides = c_contiguous_strides(1, shape)
             else:
                 raise ValueError("invalid order: %s" % order)
 
@@ -274,7 +270,7 @@ class LoopKernel(LoopDomain):
     # - preamble
 
     def __init__(self, device, dims, instructions, args=None, prefetch={}, schedule=None,
-            register_prefetch=None, default_vector_type=None, name="loopy_kernel",
+            register_prefetch=None, name="loopy_kernel",
             preamble=None):
         from pymbolic import parse
 
@@ -295,12 +291,6 @@ class LoopKernel(LoopDomain):
                 register_prefetch=register_prefetch, name=name,
                 preamble=preamble)
 
-        if args is None:
-            self.args = [
-                    ArrayArg(name, default_vector_type)
-                    for name in
-                    sorted(self.input_vectors()) + sorted(self.output_vectors())]
-
     @property
     @memoize_method
     def arg_dict(self):
@@ -614,6 +604,9 @@ class VariableIndexExpressionCollector(CombineMapper):
 
 
 class StrideCollector(RecursiveMapper):
+    def __init__(self, arg):
+        self.arg = arg
+
     def map_sum(self, expr):
         stride_dicts = [self.rec(ch) for ch in expr.children]
 
@@ -661,6 +654,11 @@ class StrideCollector(RecursiveMapper):
     def map_variable(self, expr):
         return {expr.name: 1}
 
+    def map_tuple(self, expr):
+        return self.rec(sum(
+            stride*expr_i for stride, expr_i in zip(
+                self.arg.strides, expr)))
+
     def map_subscript(self, expr):
         raise RuntimeError("cannot gather strides--indirect addressing in use")
 
@@ -838,7 +836,7 @@ def insert_register_prefetches(kernel):
 # {{{ code generation
 
 class LoopyCCodeMapper(CCodeMapper):
-    def __init__(self, kernel):
+    def __init__(self, kernel, no_prefetch=False):
         def constant_mapper(c):
             if isinstance(c, float):
                 # FIXME: type-variable
@@ -849,9 +847,12 @@ class LoopyCCodeMapper(CCodeMapper):
         CCodeMapper.__init__(self, constant_mapper=constant_mapper)
         self.kernel = kernel
 
+        self.no_prefetch = no_prefetch
+
     def map_subscript(self, expr, enclosing_prec):
         from pymbolic.primitives import Variable
-        if (isinstance(expr.aggregate, Variable)
+        if (not self.no_prefetch
+                and isinstance(expr.aggregate, Variable)
                 and expr.aggregate.name in self.kernel.input_vectors()):
             try:
                 pf = self.kernel.prefetch[expr.aggregate.name, expr.index]
@@ -861,6 +862,16 @@ class LoopyCCodeMapper(CCodeMapper):
                 return pf.name+"".join(
                         "[%s]" % dim.name for dim in pf.dims)
 
+        if (isinstance(expr.aggregate, Variable)
+                and isinstance(expr.index, tuple)):
+            arg = self.kernel.arg_dict[expr.aggregate.name]
+
+            from pymbolic.primitives import Subscript
+            return CCodeMapper.map_subscript(self,
+                    Subscript(expr.aggregate, sum(
+                        stride*expr_i for stride, expr_i in zip(
+                            arg.strides, expr.index))), enclosing_prec)
+
         return CCodeMapper.map_subscript(self, expr, enclosing_prec)
 
 
@@ -928,7 +939,7 @@ def generate_prefetch_code(ccm, kernel, sched_index, last_of):
 
     # {{{ next use the work group dimensions, least-stride dim first
 
-    strides = StrideCollector()(pf.index_expr)
+    strides = StrideCollector(kernel.arg_dict[pf.input_vector])(pf.index_expr)
 
     approximate_arg_values = dict(
             (arg.name, arg.approximately)
@@ -969,19 +980,24 @@ def generate_prefetch_code(ccm, kernel, sched_index, last_of):
 
     # {{{ generate fetch code
 
+    no_pf_ccm = LoopyCCodeMapper(kernel, no_prefetch=True)
+
     def make_fetch_loop_nest(pf_dim_idx, pf_dim_exprs=[], pf_idx_subst_map={}):
         # may mutate kernel for prefetch dim enlargement
 
         from pymbolic.mapper.substitutor import substitute
         if pf_dim_idx >= len(pf.dims):
             # done, return
+            from pymbolic.primitives import Variable, Subscript
+
             return Assign(
                     pf.name + "".join("[%s]" % ccm(dexpr, PREC_NONE)
                         for dexpr in pf_dim_exprs),
-                    "%s[%s]"
-                    % (pf.input_vector,
-                        substitute(pf.index_expr, pf_idx_subst_map))
-                    )
+                    no_pf_ccm(
+                        Subscript(
+                            Variable(pf.input_vector),
+                            substitute(pf.index_expr, pf_idx_subst_map)),
+                        PREC_NONE))
 
         pf_dim = pf.dims[pf_dim_idx]
         realiz_dim_list = realization_dims[pf_dim_idx]
@@ -1433,6 +1449,7 @@ class CompiledKernel:
     def __init__(self, context, kernel, size_args=None):
         self.kernel = kernel
         self.code = generate_code(kernel)
+        print self.code
         self.cl_kernel = getattr(
                 cl.Program(context, self.code).build(),
                 kernel.name)