diff --git a/MEMO b/MEMO
index b2dd2d070b8c820d4e4e0706964848b0c8459bb0..95d8f452775438126ae2eb75f008cafdd13c2745 100644
--- a/MEMO
+++ b/MEMO
@@ -61,13 +61,15 @@ To-do
 
 - add_prefetch gets a flag to separate out each access
 
-- Making parameters run-time varying, substituting values that
-  depend on other inames?
+- Allow parameters to vary at run time, substituting values
+  that depend on other inames?
 
 - Fix all tests
 
 - Scalar insn priority
 
+- : in prefetches
+
 Future ideas
 ^^^^^^^^^^^^
 
@@ -75,9 +77,6 @@ Future ideas
 
 - String instructions?
 
-- How is intra-instruction ordering of ILP loops going to be determined?
-  (taking into account that it could vary even per-instruction?)
-
 - Barriers for data exchanged via global vars?
 
 - Float4 joining on fetch/store?
@@ -88,8 +87,6 @@ Future ideas
 - Better for loop bound generation
   -> Try a triangular loop
 
-- Sharing of checks across ILP instances
-
 - Eliminate the first (pre-)barrier in a loop.
 
 - Generate automatic test against sequential code.
@@ -115,6 +112,11 @@ Future ideas
 Dealt with
 ^^^^^^^^^^
 
+- How is intra-instruction ordering of ILP loops going to be determined?
+  (taking into account that it could vary even per-instruction?)
+
+- Sharing of checks across ILP instances
+
 - Differentiate ilp.unr from ilp.seq
 
 - Allow complex-valued arithmetic, despite CL's best efforts.
diff --git a/loopy/__init__.py b/loopy/__init__.py
index fdb258faa0e840fd30c5ad654d1aea03b8cc2756..8633d71defa850293b2e6c8a1dfd80b812275800 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -205,6 +205,8 @@ def split_dimension(kernel, split_iname, inner_length,
     if split_iname not in kernel.all_inames():
         raise ValueError("cannot split loop for unknown variable '%s'" % split_iname)
 
+    applied_substitutions = kernel.applied_substitutions[:]
+
     if outer_iname is None:
         outer_iname = split_iname+"_outer"
     if inner_iname is None:
@@ -248,6 +250,7 @@ def split_dimension(kernel, split_iname, inner_length,
     new_insns = []
     for insn in kernel.instructions:
         subst_map = {var(split_iname): new_loop_index}
+        applied_substitutions.append(subst_map)
 
         from loopy.symbolic import SubstitutionMapper
         subst_mapper = SubstitutionMapper(subst_map.get)
@@ -277,6 +280,7 @@ def split_dimension(kernel, split_iname, inner_length,
             .copy(domain=new_domain,
                 iname_slab_increments=iname_slab_increments,
                 instructions=new_insns,
+                applied_substitutions=applied_substitutions,
                 ))
 
     return tag_dimensions(result, {outer_iname: outer_tag, inner_iname: inner_tag})
@@ -367,7 +371,10 @@ def join_dimensions(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()):
 
     result = (kernel
             .map_expressions(subst_map, exclude_instructions=True)
-            .copy(instructions=new_insns, domain=new_domain))
+            .copy(
+                instructions=new_insns, domain=new_domain,
+                applied_substitutions=kernel.applied_substitutions + [subst_map]
+                ))
 
     return tag_dimensions(result, {new_iname: tag})
 
@@ -417,8 +424,43 @@ def tag_dimensions(kernel, iname_to_tag, force=False):
 
 # {{{ convenience: add_prefetch
 
-def add_prefetch(kernel, var_name, sweep_dims=[], dim_arg_names=None,
-        default_tag="l.auto", rule_name=None):
+def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
+        default_tag="l.auto", rule_name=None, footprint_indices=None):
+    """Prefetch all accesses to the variable *var_name*, with all accesses
+    being swept through *sweep_inames*.
+
+    :ivar dim_arg_names: List of names representing each fetch axis.
+    :ivar rule_name: base name of the generated temporary variable.
+    :ivar footprint_indices: A list of tuples indicating the index set used
+        to generate the footprint.
+
+        If only one such set of indices is desired, this may also be specified
+        directly by putting an index expression into *var_name*. Substitutions
+        such as those occurring in dimension splits are recorded and also
+        applied to these indices.
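+
+    A usage sketch, mirroring the call in ``test/test_nbody.py`` (the kernel
+    and iname names are only illustrative)::
+
+        # prefetch x[j,k]; the footprint is swept over j_inner and k
+        knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"])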
+    """
+
+    # {{{ fish indexing out of var_name and into sweep_indices
+
+    from loopy.symbolic import parse
+    parsed_var_name = parse(var_name)
+
+    from pymbolic.primitives import Variable, Subscript
+    if isinstance(parsed_var_name, Variable):
+        # no subscript given--take the footprint indices (if any) from the
+        # keyword argument
+        sweep_indices = footprint_indices
+    elif isinstance(parsed_var_name, Subscript):
+        if footprint_indices is not None:
+            raise TypeError("if footprint_indices is specified, then var_name "
+                    "may not contain a subscript")
+
+        assert isinstance(parsed_var_name.aggregate, Variable)
+        var_name = parsed_var_name.aggregate.name
+        sweep_indices = [parsed_var_name.index]
+    else:
+        raise ValueError("var_name must either be a variable name or a subscript")
+
+    # }}}
 
     if rule_name is None:
         rule_name = kernel.make_unique_var_name("%s_fetch" % var_name)
@@ -448,19 +490,57 @@ def add_prefetch(kernel, var_name, sweep_dims=[], dim_arg_names=None,
     kernel = extract_subst(kernel, rule_name, uni_template, parameters)
 
     new_fetch_dims = []
-    for fd in sweep_dims:
+    for fd in sweep_inames:
         if isinstance(fd, int):
             new_fetch_dims.append(parameters[fd])
         else:
             new_fetch_dims.append(fd)
 
-    return precompute(kernel, rule_name, arg.dtype, sweep_dims,
+    footprint_generators = None
+
+    if sweep_indices is not None:
+        if not isinstance(sweep_indices, (list, tuple)):
+            sweep_indices = [sweep_indices]
+
+        def standardize_sweep_indices(si):
+            if isinstance(si, str):
+                from loopy.symbolic import parse
+                si = parse(si)
+
+            if not isinstance(si, tuple):
+                si = (si,)
+
+            if len(si) != arg.dimensions:
+                raise ValueError("sweep index '%s' has the wrong number of dimensions"
+                        % (si,))
+
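+            # replay substitutions recorded by earlier transformations
+            # (e.g. split_dimension) on the user-given indices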
+            for subst_map in kernel.applied_substitutions:
+                from loopy.symbolic import SubstitutionMapper
+                from pymbolic.mapper.substitutor import make_subst_func
+                si = SubstitutionMapper(make_subst_func(subst_map))(si)
+
+            return si
+
+        sweep_indices = [standardize_sweep_indices(si) for si in sweep_indices]
+
+        from pymbolic.primitives import Variable
+        footprint_generators = [
+                Variable(var_name)(*si) for si in sweep_indices]
+
+    new_kernel = precompute(kernel, rule_name, arg.dtype, sweep_inames,
+            footprint_generators=footprint_generators,
             new_storage_axis_names=dim_arg_names,
             default_tag=default_tag)
 
-# }}}
+    # If the rule survived past precompute() (i.e. some accesses fell outside
+    # the footprint), expand it at the remaining use sites so the kernel is
+    # not left with a dangling rule.
+    if rule_name in new_kernel.substitutions:
+        return apply_subst(new_kernel, rule_name)
+    else:
+        return new_kernel
 
 
+# }}}
+
 
 
 
diff --git a/loopy/cse.py b/loopy/cse.py
index 82b43d25d0311bb9f8dff46e28eb8d054d7d8570..b16983f5e3bbf8eeddcc2d7c84542065131b5c54 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -13,7 +13,7 @@ from pymbolic import var
 
 
 class InvocationDescriptor(Record):
-    __slots__ = ["expr", "args", ]
+    __slots__ = ["expr", "args", "expands_footprint", "is_in_footprint"]
 
 
 
@@ -38,70 +38,105 @@ def to_parameters_or_project_out(param_inames, set_inames, set):
 
 # {{{ construct storage->sweep map
 
-def construct_storage_to_sweep_map(invocation_descriptors, domain_dup_sweep,
+def build_per_access_storage_to_sweep_map(invdesc, domain_dup_sweep,
         storage_axis_names, storage_axis_sources, prime_sweep_inames):
 
-    # The storage map goes from storage axes to domain_dup_sweep.
-    # The first len(arg_names) storage dimensions are the rule's arguments.
+    map_space = domain_dup_sweep.get_space()
+    stor_dim = len(storage_axis_names)
+    rn = map_space.dim(dim_type.out)
 
-    result = None
+    map_space = map_space.add_dims(dim_type.in_, stor_dim)
+    for i, saxis in enumerate(storage_axis_names):
+        # arg names are initially primed, to be replaced with unprimed
+        # base-0 versions below
 
-    for invdesc in invocation_descriptors:
-        map_space = domain_dup_sweep.get_space()
-        stor_dim = len(storage_axis_names)
-        rn = map_space.dim(dim_type.out)
+        map_space = map_space.set_dim_name(dim_type.in_, i, saxis+"'")
 
-        map_space = map_space.add_dims(dim_type.in_, stor_dim)
-        for i, saxis in enumerate(storage_axis_names):
-            # arg names are initially primed, to be replaced with unprimed
-            # base-0 versions below
+    # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep]
 
-            map_space = map_space.set_dim_name(dim_type.in_, i, saxis+"'")
+    set_space = map_space.move_dims(
+            dim_type.out, rn,
+            dim_type.in_, 0, stor_dim).range()
 
-        # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep]
+    # set_space: [domain](dup_sweep_index)[dup_sweep][stor_axes']
 
-        set_space = map_space.move_dims(
-                dim_type.out, rn,
-                dim_type.in_, 0, stor_dim).range()
+    stor2sweep = None
 
-        # set_space: [domain](dup_sweep_index)[dup_sweep][stor_axes']
+    from loopy.symbolic import aff_from_expr
 
-        stor2sweep = None
+    for saxis, saxis_source in zip(storage_axis_names, storage_axis_sources):
+        if isinstance(saxis_source, int):
+            # an argument
+            cns = isl.Constraint.equality_from_aff(
+                    aff_from_expr(set_space,
+                        var(saxis+"'")
+                        - prime_sweep_inames(invdesc.args[saxis_source])))
+        else:
+            # a 'bare' sweep iname
+            cns = isl.Constraint.equality_from_aff(
+                    aff_from_expr(set_space,
+                        var(saxis+"'")
+                        - prime_sweep_inames(var(saxis_source))))
 
-        from loopy.symbolic import aff_from_expr
+        cns_map = isl.BasicMap.from_constraint(cns)
+        if stor2sweep is None:
+            stor2sweep = cns_map
+        else:
+            stor2sweep = stor2sweep.intersect(cns_map)
 
-        for saxis, saxis_source in zip(storage_axis_names, storage_axis_sources):
-            if isinstance(saxis_source, int):
-                # an argument
-                cns = isl.Constraint.equality_from_aff(
-                        aff_from_expr(set_space,
-                            var(saxis+"'")
-                            - prime_sweep_inames(invdesc.args[saxis_source])))
-            else:
-                # a 'bare' sweep iname
-                cns = isl.Constraint.equality_from_aff(
-                        aff_from_expr(set_space,
-                            var(saxis+"'")
-                            - prime_sweep_inames(var(saxis_source))))
-
-            cns_map = isl.BasicMap.from_constraint(cns)
-            if stor2sweep is None:
-                stor2sweep = cns_map
+    stor2sweep = stor2sweep.move_dims(
+            dim_type.in_, 0,
+            dim_type.out, rn, stor_dim)
+
+    # stor2sweep is back in map_space
+    return stor2sweep
+
+def build_global_storage_to_sweep_map(invocation_descriptors, domain_dup_sweep,
+        storage_axis_names, storage_axis_sources, prime_sweep_inames):
+    """
+    As a side effect, this fills out is_in_footprint in the
+    invocation descriptors.
+    """
+
+    # The storage map goes from storage axes to domain_dup_sweep.
+    # The first len(arg_names) storage dimensions are the rule's arguments.
+
+    global_stor2sweep = None
+
+    # build footprint
+    for invdesc in invocation_descriptors:
+        if invdesc.expands_footprint:
+            stor2sweep = build_per_access_storage_to_sweep_map(invdesc, domain_dup_sweep,
+                    storage_axis_names, storage_axis_sources, prime_sweep_inames)
+
+            if global_stor2sweep is None:
+                global_stor2sweep = stor2sweep
             else:
-                stor2sweep = stor2sweep.intersect(cns_map)
+                global_stor2sweep = global_stor2sweep.union(stor2sweep)
+
+            invdesc.is_in_footprint = True
+
+    if isinstance(global_stor2sweep, isl.BasicMap):
+        global_stor2sweep = isl.Map.from_basic_map(global_stor2sweep)
+    global_stor2sweep = global_stor2sweep.intersect_range(domain_dup_sweep)
 
-        stor2sweep = stor2sweep.move_dims(
-                dim_type.in_, 0,
-                dim_type.out, rn, stor_dim)
+    # check if non-footprint-building invocation descriptors fall into footprint
+    for invdesc in invocation_descriptors:
+        stor2sweep = build_per_access_storage_to_sweep_map(invdesc, domain_dup_sweep,
+                    storage_axis_names, storage_axis_sources, prime_sweep_inames)
+
+        if isinstance(stor2sweep, isl.BasicMap):
+            stor2sweep = isl.Map.from_basic_map(stor2sweep)
 
-        # stor2sweep is back in map_space
+        stor2sweep = stor2sweep.intersect_range(domain_dup_sweep)
 
-        if result is None:
-            result = stor2sweep
+        if not invdesc.expands_footprint:
+            invdesc.is_in_footprint = stor2sweep.is_subset(global_stor2sweep)
         else:
-            result = result.union(stor2sweep)
+            assert stor2sweep.domain().is_subset(global_stor2sweep.domain())
+
 
-    return result
+    return global_stor2sweep
 
 # }}}
 
@@ -157,14 +192,10 @@ def get_access_info(kernel, subst_name,
 
     # }}}
 
-    stor2sweep = construct_storage_to_sweep_map(
+    stor2sweep = build_global_storage_to_sweep_map(
             invocation_descriptors, domain_dup_sweep,
             storage_axis_names, storage_axis_sources, prime_sweep_inames)
 
-    if isinstance(stor2sweep, isl.BasicMap):
-        stor2sweep = isl.Map.from_basic_map(stor2sweep)
-    stor2sweep = stor2sweep.intersect_range(domain_dup_sweep)
-
     storage_base_indices, storage_shape = compute_bounds(
             kernel, subst_name, stor2sweep, sweep_inames,
             storage_axis_names)
@@ -186,7 +217,7 @@ def get_access_info(kernel, subst_name,
     # }}}
 
     # {{{ subtract off the base indices
-    # add the new, base-0 as new in dimensions
+    # add the new, base-0 indices as new 'in' dimensions
 
     sp = stor2sweep.get_space()
     stor_idx = sp.dim(dim_type.out)
@@ -251,6 +282,7 @@ def simplify_via_aff(expr):
 
 
 def precompute(kernel, subst_name, dtype, sweep_axes=[],
+        footprint_generators=None,
         storage_axes=None, new_storage_axis_names=None, storage_axis_to_tag={},
         default_tag="l.auto"):
     """Precompute the expression described in the substitution rule *subst_name*
@@ -258,8 +290,13 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
     a list of *sweep_axes* (order irrelevant) and an ordered list of *storage_axes*
     (whose order will describe the axis ordering of the temporary array).
 
-    This function will then examine all usage sites of the substitution rule and
-    determine what the storage footprint of that sweep is.
+    *subst_name* may contain a period (".") to restrict the precomputation to
+    a subset of the usage sites of the substitution rule. (Namely those usage
+    sites that use the same dotted name.)
+
+    This function will then examine the *footprint_generators* (or all usage
+    sites of the substitution rule if not specified) and determine what the
+    storage footprint of that sweep is.
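+
+    Each footprint generator is an invocation of the substitution rule,
+    given as a string or a :mod:`pymbolic` expression. A sketch (the rule
+    and iname names are only illustrative)::
+
+        knl = precompute(knl, "x_fetch", np.float32, ["j_inner"],
+                footprint_generators=["x_fetch(j_inner, k)"])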
 
     The following cases can arise for each sweep axis:
 
@@ -276,7 +313,7 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
       the so-named formal argument at *all* usage sites.
 
     :arg sweep_axes: A :class:`list` of inames and/or rule argument names to be swept.
-    :arg storage_dims: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes.
+    :arg storage_axes: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes.
 
     If `storage_axes` is not specified, it defaults to the arrangement
     `<direct sweep axes><arguments>` with the direct sweep axes being the
@@ -312,12 +349,32 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
                     "exclusively of inames" % expr)
 
         invocation_descriptors.append(
-                InvocationDescriptor(expr=expr, args=args))
+                InvocationDescriptor(expr=expr, args=args,
+                    expands_footprint=footprint_generators is None))
         return expr
 
     from loopy.symbolic import SubstitutionCallbackMapper
     scm = SubstitutionCallbackMapper([(subst_name, subst_instance)], gather_substs)
 
+    if footprint_generators:
+        for fpg in footprint_generators:
+            if isinstance(fpg, str):
+                from loopy.symbolic import parse
+                fpg = parse(fpg)
+
+            from pymbolic.primitives import Variable, Call
+            if isinstance(fpg, Variable):
+                args = ()
+            elif isinstance(fpg, Call):
+                args = fpg.parameters
+            else:
+                raise ValueError("footprint generator must "
+                        "be substitution rule invocation")
+
+            invocation_descriptors.append(
+                    InvocationDescriptor(expr=fpg, args=args,
+                        expands_footprint=True))
+
     # We need to work on the fully expanded form of an expression.
     # To that end, instantiate a substitutor.
     from loopy.symbolic import ParametrizedSubstitutor
@@ -347,6 +404,9 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
     sweep_inames = set()
 
     for invdesc in invocation_descriptors:
+        if not invdesc.expands_footprint:
+            continue
+
         for swaxis in sweep_axes:
             if isinstance(swaxis, int):
                 sweep_inames.update(
@@ -369,6 +429,9 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
 
     usage_arg_deps = set()
     for invdesc in invocation_descriptors:
+        if not invdesc.expands_footprint:
+            continue
+
         for arg in invdesc.args:
             usage_arg_deps.update(get_dependencies(arg))
 
@@ -515,9 +578,27 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
 
     # }}}
 
-    # {{{ substitute rule into expressions in kernel
+    # {{{ substitute rule into expressions in kernel (if within footprint)
+
+    left_unused_subst_rule_invocations = [False]
 
     def do_substs(expr, name, instance, args, rec):
+        if instance != subst_instance:
+            left_unused_subst_rule_invocations[0] = True
+            return expr
+
+        found = False
+        for invdesc in invocation_descriptors:
+            if expr == invdesc.expr:
+                found = True
+                break
+
+        assert found, expr
+
+        if not invdesc.is_in_footprint:
+            left_unused_subst_rule_invocations[0] = True
+            return expr
+
         if len(args) != len(subst.arguments):
             raise ValueError("invocation of '%s' with too few arguments"
                     % name)
@@ -543,23 +624,23 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[],
             new_outer_expr = new_outer_expr[tuple(stor_subscript)]
 
         return new_outer_expr
-        # can't nest, don't recurse
+        # can't possibly be nested, don't recurse
 
     new_insns = [compute_insn]
 
-    sub_map = SubstitutionCallbackMapper([(subst_name, subst_instance)], do_substs)
+    sub_map = SubstitutionCallbackMapper([subst_name], do_substs)
     for insn in kernel.instructions:
         new_insn = insn.copy(expression=sub_map(insn.expression))
         new_insns.append(new_insn)
 
+    # also catch uses of our rule in other substitution rules
     new_substs = dict(
             (s.name, s.copy(expression=sub_map(s.expression)))
-            for s in kernel.substitutions.itervalues()
+            for s in kernel.substitutions.itervalues())
 
-            # leave rule be if instance was specified
-            # (even if it might end up unused--FIXME)
-            if subst_instance is not None 
-            or s.name != subst_name)
+    # If the substitution above caught all uses of the rule, it is no longer
+    # needed--delete it.
+    if not left_unused_subst_rule_invocations[0]:
+        del new_substs[subst_name]
 
     # }}}
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index f5985bf563a981fca0e9dff6d05749db219c11b1..9b471a110609325ed40c390322a96fe54dd9b13d 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -494,6 +494,9 @@ class LoopKernel(Record):
         objects
     :ivar lowest_priority_inames:
     :ivar breakable_inames: these inames' loops may be broken up by the scheduler
+    :ivar applied_substitutions: A list of past substitution dictionaries that
+        were applied to the kernel. These are stored so that they may be repeated
+        on expressions the user specifies later.
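+        (For example, :func:`split_dimension` records the mapping that replaces
+        the split iname by its outer/inner expression.)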
 
     :ivar cache_manager:
 
@@ -510,7 +513,8 @@ class LoopKernel(Record):
             temporary_variables={},
             local_sizes={},
             iname_to_tag={}, iname_to_tag_requests=None, substitutions={},
-            cache_manager=None, lowest_priority_inames=[], breakable_inames=set()):
+            cache_manager=None, lowest_priority_inames=[], breakable_inames=set(),
+            applied_substitutions=[]):
         """
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
             Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}"
@@ -573,7 +577,7 @@ class LoopKernel(Record):
         # {{{ instruction parser
 
         def parse_if_necessary(insn):
-            from pymbolic import parse
+            from loopy.symbolic import parse
 
             if isinstance(insn, Instruction):
                 insns.append(insn)
@@ -597,8 +601,7 @@ class LoopKernel(Record):
                 raise RuntimeError("insn parse error")
 
             lhs = parse(groups["lhs"])
-            from loopy.symbolic import FunctionToPrimitiveMapper
-            rhs = FunctionToPrimitiveMapper()(parse(groups["rhs"]))
+            rhs = parse(groups["rhs"])
 
             if insn_match is not None:
                 if groups["label"] is not None:
@@ -707,7 +710,8 @@ class LoopKernel(Record):
                 substitutions=substitutions,
                 cache_manager=cache_manager,
                 lowest_priority_inames=lowest_priority_inames,
-                breakable_inames=breakable_inames)
+                breakable_inames=breakable_inames,
+                applied_substitutions=applied_substitutions)
 
     def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()):
         if insns is None:
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index abffae6e6acf409757d9a8a466639080623f87a8..2b1d584726d27d1d5c85ca44b2f53fb62d98db98 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -117,7 +117,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase):
 
 # }}}
 
-# {{{ functions to primitives
+# {{{ functions to primitives, parsing
 
 class FunctionToPrimitiveMapper(IdentityMapper):
     """Looks for invocations of a function called 'cse' or 'reduce' and
@@ -194,6 +194,10 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 
         return Reduction(operation, tuple(processed_inames), red_expr)
 
+def parse(expr_str):
+    from pymbolic import parse
+    return FunctionToPrimitiveMapper()(parse(expr_str))
+
 # }}}
 
 # {{{ reduction loop splitter
diff --git a/test/test_nbody.py b/test/test_nbody.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f9db5f724a46f3ff53a6274b4a4ed04ca68469a
--- /dev/null
+++ b/test/test_nbody.py
@@ -0,0 +1,74 @@
+
+from __future__ import division
+
+import numpy as np
+import pyopencl as cl
+import loopy as lp
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+
+
+def test_nbody(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(ctx.devices[0],
+            "[N] -> {[i,j,k]: 0<=i,j<N and 0<=k<3 }",
+            [
+            "axdist(k) := x[i,k]-x[j,k]",
+            "invdist := rsqrt(sum_float32(k, axdist(k)**2))",
+            "pot[i] = sum_float32(j, if(i != j, invdist, 0))",
+            ],
+            [
+            lp.ArrayArg("x", dtype, shape="N,3", order="C"),
+            lp.ArrayArg("pot", dtype, shape="N", order="C"),
+            lp.ScalarArg("N", np.int32),
+            ],
+            name="nbody", assumptions="N>=1")
+
+    seq_knl = knl
+
+    def variant_1(knl):
+        knl = lp.split_dimension(knl, "i", 256,
+                outer_tag="g.0", inner_tag="l.0",
+                slabs=(0,1))
+        knl = lp.split_dimension(knl, "j", 256, slabs=(0,1))
+        return knl, []
+
+    def variant_cpu(knl):
+        knl = lp.split_dimension(knl, "i", 1024,
+                outer_tag="g.0", slabs=(0,1))
+        return knl, []
+
+    def variant_gpu(knl):
+        knl = lp.split_dimension(knl, "i", 256,
+                outer_tag="g.0", inner_tag="l.0", slabs=(0,1))
+        knl = lp.split_dimension(knl, "j", 256, slabs=(0,1))
+        knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
+        knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"])
+        return knl, ["j_outer", "j_inner"]
+
+    n = 100
+
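+    # only variant_gpu is exercised below; variant_1 and variant_cpu are
+    # alternative transform chains that can be swapped into the list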
+    for variant in [variant_gpu]:
+        variant_knl, loop_prio = variant(knl)
+        kernel_gen = lp.generate_loop_schedules(variant_knl,
+                loop_priority=loop_prio)
+        kernel_gen = lp.check_kernels(kernel_gen, dict(N=n))
+
+        lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
+                op_count=4*n**2*1e-9, op_label="GOps/s",
+                parameters={"N": n}, print_ref_code=True)
+
+
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])