diff --git a/bin/loopy b/bin/loopy
index 57dac8038ca638269249783151e4b195b05db223..ef4be07444d742ab9059674661f60b1fb6262467 100644
--- a/bin/loopy
+++ b/bin/loopy
@@ -24,6 +24,15 @@ def to_python_literal(value):
         # It's a float
         return repr(float(value))
 
+    if value.endswith("f"):
+        # A trailing "f" may mark a float literal; parse what precedes it.
+        try:
+            as_float = float(value[:-1])
+        except ValueError:
+            pass
+        else:
+            return repr(as_float)
+
     return repr(value)
 
 
diff --git a/doc/reference.rst b/doc/reference.rst
index 3d78bee36d3e70231fd306344ba2f119529cd494..59ab3c9864d2af4f3a7b885c5c16dcc10e522a08 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -39,6 +39,7 @@ Loopy's expressions are a slight superset of the expressions supported by
     * duplication of reduction inames
 * complex-valued arithmetic
 * tagging of array access and substitution rule use ("$")
+* ``indexof``, ``indexof_vec``: the linear index of an array access, e.g. ``indexof(a[i,j])``
 
 .. _types:
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 25cb764f3cd38c2b10b59edb71d850e28d6da2d6..24cb034630d401ba87ebc10b63c7f47aa498316f 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -997,8 +997,8 @@ transformation exists in :func:`loopy.add_prefetch`:
     >>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
-        a_fetch = a[16 * gid(0) + lid(0)];
         acc_k = 0.0f;
+        a_fetch = a[16 * gid(0) + lid(0)];
         for (int k = 0; k <= 15; ++k)
           acc_k = acc_k + a_fetch;
         out[16 * gid(0) + lid(0)] = acc_k;
@@ -1021,10 +1021,10 @@ earlier:
     >>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
-      if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
-        a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
         acc_k = 0.0f;
+      if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
+        a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];
       barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
       {
@@ -1459,8 +1459,8 @@ Now to make things more interesting, we'll create a kernel with barriers:
     {
       __local int c[50 * 10 * 99];
     <BLANKLINE>
-      for (int i = 0; i <= 49; ++i)
-        for (int j = 0; j <= 9; ++j)
+      for (int j = 0; j <= 9; ++j)
+        for (int i = 0; i <= 49; ++i)
         {
           barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
           c[990 * i + 99 * j + lid(0) + 1 + gid(0) * 128] = 2 * a[980 * i + 98 * j + lid(0) + 1 + gid(0) * 128];
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index fd4e5621343b21e78e8beeba21e21588e11a6206..d51f669ca1e9d0afb53d8c39a7a784e772075767 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -320,10 +320,6 @@ def _enumerate_cl_devices_for_ref_test():
     cpu_devs = []
 
     for pf in cl.get_platforms():
-        if pf.name == "Portable Computing Language":
-            # pocl not mature enough yet, sadly
-            continue
-
         for dev in pf.get_devices():
             if dev.type & cl.device_type.CPU:
                 cpu_devs.append(dev)
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 37c8a12ee125fb26c54e84918a538c8d36e0cb4a..225f7e7fec30d38159a93df6ec4df35a45c21bc6 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -561,7 +561,7 @@ def generate_code(kernel, device=None):
     preamble_generators = (kernel.preamble_generators
             + kernel.target.preamble_generators())
     for prea_gen in preamble_generators:
-        preambles.extend(prea_gen(kernel.target, seen_dtypes, seen_functions))
+        preambles.extend(prea_gen(kernel, seen_dtypes, seen_functions))
 
     seen_preamble_tags = set()
     dedup_preambles = []
diff --git a/loopy/expression.py b/loopy/expression.py
index 3194ac571b1959dca77f6451c78a29096824ed48..94eaf4448f8489f55c3185cc3e3ec121bbb08993 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -184,6 +184,9 @@ class TypeInferenceMapper(CombineMapper):
         if isinstance(identifier, Variable):
             identifier = identifier.name
 
+        if identifier in ["indexof", "indexof_vec"]:
+            return self.kernel.index_dtype
+
         arg_dtypes = tuple(self.rec(par) for par in expr.parameters)
 
         mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index 1cd7aa6f3e4945c306442ec09f4077d65759050d..f2bbb288249332dc6a333f95fa2600fa48967e60 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -25,7 +25,7 @@ THE SOFTWARE.
 from loopy.diagnostic import LoopyError
 
 
-def c_preprocess(source, defines=None, filename="<floopy source>"):
+def c_preprocess(source, defines=None, filename=None, include_paths=None):
     """
     :arg source: a string, possibly containing C preprocessor constructs
     :arg defines: a list of strings as they might occur after a
@@ -38,10 +38,22 @@ def c_preprocess(source, defines=None, filename="<floopy source>"):
     except ImportError:
         raise LoopyError("Using the C preprocessor requires PLY to be installed")
 
+    input_dirname = None
+    if filename is None:
+        filename = "<floopy source>"
+    else:
+        from os.path import dirname
+        input_dirname = dirname(filename)
+
     lexer = lex.lex(cpp)
 
     from ply.cpp import Preprocessor
     p = Preprocessor(lexer)
+    if input_dirname is not None:
+        p.add_path(input_dirname)
+    if include_paths:
+        for inc_path in include_paths:
+            p.add_path(inc_path)
 
     if defines:
         for d in defines:
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d2d178bc459cb0c231a78a5a1b2c3b8092d07536..13afaa66d05b8dce89a2eb3f1f06e8b752dc5420 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -302,7 +302,7 @@ class LoopKernel(RecordWithoutPickling):
         manglers = self.target.function_manglers() + self.function_manglers
 
         for mangler in manglers:
-            mangle_result = mangler(self.target, identifier, arg_dtypes)
+            mangle_result = mangler(self, identifier, arg_dtypes)
             if mangle_result is not None:
                 return mangle_result
 
@@ -316,7 +316,7 @@ class LoopKernel(RecordWithoutPickling):
         manglers = self.target.symbol_manglers() + self.symbol_manglers
 
         for mangler in manglers:
-            result = mangler(self.target, identifier)
+            result = mangler(self, identifier)
             if result is not None:
                 return result
 
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 9a3c9c0cfbc79e9aba934b7a7c051665b19c19c5..92fb232e92a6e891efcef7022e0dceedcb4f692f 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -306,8 +306,8 @@ def parse_array_dim_tags(dim_tags, use_increasing_target_axes=False):
                     raise LoopyError("may not mix C/F dim_tag specifications with "
                             "explicit specification of layout nesting levels")
             else:
-                target_axis_to_has_explicit_nesting_level[parsed_dim_tag.target_axis] = \
-                        has_explicit_nesting_level
+                target_axis_to_has_explicit_nesting_level[
+                        parsed_dim_tag.target_axis] = has_explicit_nesting_level
 
             # }}}
 
diff --git a/loopy/library/function.py b/loopy/library/function.py
index e494169bbe5b83df852ea3d483ed3640381891f6..df623a4770f4f14a7952ee2e0edbf59939de1cfd 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -23,19 +23,19 @@ THE SOFTWARE.
 """
 
 
-def default_function_mangler(target, name, arg_dtypes):
+def default_function_mangler(kernel, name, arg_dtypes):
     from loopy.library.reduction import reduction_function_mangler
 
     manglers = [reduction_function_mangler]
     for mangler in manglers:
-        result = mangler(target, name, arg_dtypes)
+        result = mangler(kernel, name, arg_dtypes)
         if result is not None:
             return result
 
     return None
 
 
-def single_arg_function_mangler(target, name, arg_dtypes):
+def single_arg_function_mangler(kernel, name, arg_dtypes):
     if len(arg_dtypes) == 1:
         dtype, = arg_dtypes
         return dtype, name
diff --git a/loopy/schedule.py b/loopy/schedule.py
index bba42381a5a4f67e957c137a2c866dd816c523c4..f22b95d45275d54d473a97a8f7a0dfde69555d6b 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -440,9 +440,6 @@ class SchedulerState(Record):
         *Note:* ``ilp`` and ``vec`` are not 'parallel' for the purposes of the
         scheduler.  See :attr:`ilp_inames`, :attr:`vec_inames`.
 
-    .. attribute:: loop_priority
-
-
     .. rubric:: Time-varying scheduler state
 
     .. attribute:: active_inames
@@ -536,7 +533,10 @@ def generate_loop_schedules_internal(
 
     def insn_sort_key(insn_id):
         insn = kernel.id_to_insn[insn_id]
-        return (insn.priority, len(active_groups & insn.groups))
+
+        # Sort by insn.id as a last criterion to achieve deterministic
+        # schedule generation order.
+        return (insn.priority, len(active_groups & insn.groups), insn.id)
 
     insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids,
             key=insn_sort_key, reverse=True)
@@ -864,7 +864,11 @@ def generate_loop_schedules_internal(
                 found_viable_schedule = False
 
                 for iname in sorted(tier,
-                        key=lambda iname: iname_to_usefulness.get(iname, 0),
+                        key=lambda iname: (
+                            iname_to_usefulness.get(iname, 0),
+                            # Sort by iname to achieve deterministic
+                            # ordering of generated schedules.
+                            iname),
                         reverse=True):
 
                     for sub_sched in generate_loop_schedules_internal(
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 71844cfbf057c4533669dfbdce71e13f5e0a1ca3..112e7e5b95b3354092a6ced076a9a03140c87105 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -35,6 +35,7 @@ import islpy as isl
 
 from loopy.expression import dtype_to_type_context, TypeInferenceMapper
 
+from loopy.diagnostic import LoopyError
 from loopy.tools import is_integer
 
 
@@ -74,6 +75,22 @@ class LoopyCCodeMapper(RecursiveMapper):
         else:
             return s
 
+    def find_array(self, expr):
+        """Return the array (argument or temporary) subscripted by *expr*."""
+        name = expr.aggregate.name
+        if name in self.kernel.arg_dict:
+            ary = self.kernel.arg_dict[name]
+        elif name in self.kernel.temporary_variables:
+            ary = self.kernel.temporary_variables[name]
+        else:
+            raise RuntimeError(
+                    "nothing known about subscripted variable '%s'" % name)
+        from loopy.kernel.array import ArrayBase
+        if not isinstance(ary, ArrayBase):
+            raise RuntimeError(
+                    "subscripted variable '%s' is not an array" % name)
+        return ary
+
     def rec(self, expr, prec, type_context=None, needed_dtype=None):
         if needed_dtype is None:
             return RecursiveMapper.rec(self, expr, prec, type_context)
@@ -150,18 +167,7 @@ class LoopyCCodeMapper(RecursiveMapper):
         if not isinstance(expr.aggregate, Variable):
             return base_impl(expr, enclosing_prec, type_context)
 
-        if expr.aggregate.name in self.kernel.arg_dict:
-            ary = self.kernel.arg_dict[expr.aggregate.name]
-        elif expr.aggregate.name in self.kernel.temporary_variables:
-            ary = self.kernel.temporary_variables[expr.aggregate.name]
-        else:
-            raise RuntimeError("nothing known about subscripted variable '%s'"
-                    % expr.aggregate.name)
-
-        from loopy.kernel.array import ArrayBase
-        if not isinstance(ary, ArrayBase):
-            raise RuntimeError("subscripted variable '%s' is not an array"
-                    % expr.aggregate.name)
+        ary = self.find_array(expr)
 
         from loopy.kernel.array import get_access_info
         from pymbolic import evaluate
@@ -367,11 +373,54 @@ class LoopyCCodeMapper(RecursiveMapper):
                         "for constant '%s'" % expr)
 
     def map_call(self, expr, enclosing_prec, type_context):
-        from pymbolic.primitives import Variable
+        from pymbolic.primitives import Variable, Subscript
         from pymbolic.mapper.stringifier import PREC_NONE
 
         identifier = expr.function
 
+        # {{{ implement indexof, indexof_vec
+
+        # expr.function is not necessarily a Variable (cf. the isinstance
+        # check further down), so test the type before touching .name.
+        if (isinstance(identifier, Variable)
+                and identifier.name in ["indexof", "indexof_vec"]):
+            if len(expr.parameters) != 1:
+                raise LoopyError("%s takes exactly one argument" % identifier.name)
+            arg, = expr.parameters
+            if not isinstance(arg, Subscript):
+                raise LoopyError(
+                        "argument to %s must be a subscript" % identifier.name)
+
+            ary = self.find_array(arg)
+
+            from loopy.kernel.data import ImageArg
+            if isinstance(ary, ImageArg):
+                raise LoopyError("%s does not support images" % identifier.name)
+
+            from loopy.kernel.array import get_access_info
+            from pymbolic import evaluate
+            access_info = get_access_info(self.kernel.target, ary, arg.index,
+                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
+                    self.codegen_state.vectorization_info)
+
+            if identifier.name == "indexof":
+                return access_info.subscripts[0]
+
+            from loopy.kernel.array import VectorArrayDimTag
+            ivec = None
+            for iaxis, dim_tag in enumerate(ary.dim_tags):
+                if isinstance(dim_tag, VectorArrayDimTag):
+                    ivec = iaxis
+
+            if ivec is None:
+                return access_info.subscripts[0]
+
+            return (
+                access_info.subscripts[0]*ary.shape[ivec]
+                + access_info.vector_index)
+
+        # }}}
+
         c_name = None
         if isinstance(identifier, Variable):
             identifier = identifier.name
diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py
index eebe6f5da0b81fa9b4c1ac7b4cda0ba8b1ac283e..d038c329a9eff73a95458fde44078c26f3dbbc56 100644
--- a/loopy/target/opencl/__init__.py
+++ b/loopy/target/opencl/__init__.py
@@ -105,7 +105,7 @@ def _register_vector_types(dtype_registry):
 
 # {{{ function mangler
 
-def opencl_function_mangler(target, name, arg_dtypes):
+def opencl_function_mangler(kernel, name, arg_dtypes):
     if not isinstance(name, str):
         return None
 
@@ -134,7 +134,7 @@ def opencl_function_mangler(target, name, arg_dtypes):
 
 # {{{ symbol mangler
 
-def opencl_symbol_mangler(target, name):
+def opencl_symbol_mangler(kernel, name):
     # FIXME: should be more picky about exact names
     if name.startswith("FLT_"):
         return np.dtype(np.float32), name
@@ -155,7 +155,7 @@ def opencl_symbol_mangler(target, name):
 
 # {{{ preamble generator
 
-def opencl_preamble_generator(target, seen_dtypes, seen_functions):
+def opencl_preamble_generator(kernel, seen_dtypes, seen_functions):
     has_double = False
 
     for dtype in seen_dtypes:
@@ -229,7 +229,8 @@ class OpenCLTarget(CTarget):
     def is_vector_dtype(self, dtype):
-        return list(vec.types.values())
+        """Return whether *dtype* is one of the dtypes in ``vec.types``."""
+        return dtype in list(vec.types.values())
 
-    def get_vector_dtype(self, base, count):
+    def vector_dtype(self, base, count):
         return vec.types[base, count]
 
     def wrap_function_declaration(self, kernel, fdecl):
diff --git a/setup.cfg b/setup.cfg
index 2dc94705f792ddca37ca7bf1f8ddec67be7ed3b9..d34ecdd6d469b931929187c36be61a2ba4be25ad 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,3 @@
 [flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402
 max-line-length=85
diff --git a/test/test_loopy.py b/test/test_loopy.py
index d7d7dc5768f163a84306ef234af08cf620c4066c..c8072108032e82517773709f5f5fd257928d3bd9 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2191,6 +2191,43 @@ def test_variable_size_temporary():
         lp.generate_code(k)
 
 
+def test_indexof(ctx_factory):
+    """Check that ``indexof(out[i,j])`` yields the C-order flat index."""
+    queue = cl.CommandQueue(ctx_factory())
+
+    knl = lp.make_kernel(
+         ''' { [i,j]: 0<=i,j<5 } ''',
+         ''' out[i,j] = indexof(out[i,j])''')
+
+    knl = lp.set_options(knl, write_cl=True)
+
+    _, (out_dev,) = knl(queue)
+    flat = out_dev.get().ravel(order="C")
+
+    assert np.array_equal(flat, np.arange(25))
+
+
+def test_indexof_vec(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if ctx.devices[0].platform.name.startswith("Portable"):
+        # Accurate as of 2015-10-08
+        pytest.skip("POCL miscompiles vector code")
+
+    knl = lp.make_kernel(
+         ''' { [i,j,k]: 0<=i,j,k<4 } ''',
+         ''' out[i,j,k] = indexof_vec(out[i,j,k])''')
+
+    knl = lp.tag_inames(knl, {"i": "vec"})
+    knl = lp.tag_data_axes(knl, "out", "vec,c,c")
+    knl = lp.set_options(knl, write_cl=True)
+
+    (evt, (out,)) = knl(queue)
+    # NOTE(review): the result is not checked; this only exercises code
+    # generation and execution. TODO: compare ``out`` (4*4*4 entries).
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])