From a8aa6521358255d3e5ede0bfb5968552e66503f0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Sep 2019 23:25:40 -0500 Subject: [PATCH] Merge 'kernel_callables_v3' into 'kernel_callables_v3-edit1' --- doc/tutorial.rst | 4 +- .../fortran/ipython-integration-demo.ipynb | 17 +- examples/fortran/matmul.floopy | 4 +- examples/fortran/sparse.floopy | 4 +- examples/fortran/tagging.floopy | 4 +- examples/fortran/volumeKernel.floopy | 4 +- loopy/__init__.py | 14 +- loopy/check.py | 8 +- loopy/frontend/fortran/__init__.py | 53 ++++- loopy/ipython_ext.py | 2 +- loopy/kernel/creation.py | 94 ++++---- loopy/kernel/instruction.py | 4 +- loopy/library/reduction.py | 193 ++++++++++++---- loopy/preprocess.py | 216 ++++++++++-------- loopy/program.py | 64 +++--- loopy/symbolic.py | 12 +- loopy/target/opencl.py | 16 +- loopy/transform/callable.py | 32 ++- loopy/transform/fusion.py | 5 + loopy/type_inference.py | 2 +- test/test_callables.py | 71 +++--- test/test_fortran.py | 8 +- test/test_numa_diff.py | 20 +- 23 files changed, 520 insertions(+), 331 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index befa5e30b..e6ef54b66 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1157,7 +1157,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1554,7 +1554,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup) : ... + Op(np:dtype('float32'), add, subgroup, loopy_kernel) : ... Each line of output will look roughly like:: diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 7a5c8257b..1b0a9df8d 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -62,9 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "split_amount = 128" @@ -91,7 +89,7 @@ "\n", "!$loopy begin\n", "!\n", - "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", "! 
RESULT = [tr_fill]\n", @@ -107,15 +105,6 @@ "source": [ "print(tr_fill)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -134,7 +123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 4b3552204..a8377bedd 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end subroutine !$loopy begin -! dgemm, = lp.parse_fortran(SOURCE, FILENAME) +! dgemm = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -24,5 +24,5 @@ end subroutine ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") ! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") -! RESULT = [dgemm] +! RESULT = dgemm !$loopy end diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 18542e6b0..2b156bdd7 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end !$loopy begin -! sparse, = lp.parse_fortran(SOURCE, FILENAME) +! sparse = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -! RESULT = [sparse] +! RESULT = sparse !$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index 87aacba68..c7ebb7566 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -23,13 +23,13 @@ end ! "factor 4.0", ! "real_type real*8", ! ]) -! fill, = lp.parse_fortran(SOURCE, FILENAME) +! fill = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1") ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -! RESULT = [fill] +! RESULT = fill ! !$loopy end diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index c5784b634..211c38049 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -67,7 +67,7 @@ end subroutine volumeKernel !$loopy begin ! -! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) +! volumeKernel = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -76,6 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) -! RESULT = [volumeKernel] +! RESULT = volumeKernel ! 
!$loopy end diff --git a/loopy/__init__.py b/loopy/__init__.py index 1439cb1ff..058bc93ef 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -130,10 +130,10 @@ from loopy.type_inference import infer_unknown_types from loopy.preprocess import (preprocess_kernel, realize_reduction, preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, CountGranularity, - Op, MemAccess, get_op_map, get_mem_access_map, - get_synchronization_map, - gather_access_footprints, gather_access_footprint_bytes) +from loopy.statistics import (ToCountMap, ToCountPolynomialMap, + CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_map, + get_mem_access_map, get_synchronization_map, + gather_access_footprints, gather_access_footprint_bytes, Sync) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -269,9 +269,11 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "Op", - "MemAccess", "get_op_map", "get_mem_access_map", "get_synchronization_map", + "ToCountMap", "ToCountPolynomialMap", "CountGranularity", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_map", + "get_mem_access_map", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", + "Sync", "CompiledKernel", diff --git a/loopy/check.py b/loopy/check.py index d1ee125df..83e4fd0af 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -33,8 +33,6 @@ from loopy.type_inference import TypeInferenceMapper from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) -from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) from functools import reduce import logging @@ -145,9 +143,9 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper): return self.rec(expr.aggregate) -def check_for_integer_subscript_indices(kernel): +def check_for_integer_subscript_indices(kernel, callables_table): from pymbolic.primitives import Subscript - idx_int_checker = SubscriptIndicesIsIntChecker(kernel) + idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): idx_int_checker(insn.expression, return_tuple=isinstance(insn, @@ -763,7 +761,7 @@ def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel) + check_for_integer_subscript_indices(kernel, callables_table) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel, callables_table) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 3516ca29a..74c1ebf54 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -241,10 +241,54 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] +def _add_assignees_to_calls(knl, all_kernels): + new_insns = [] + subroutine_dict = dict((kernel.name, kernel) for kernel in all_kernels) + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction, _DataObliviousInstruction, + modify_assignee_for_array_call) + from pymbolic.primitives import Call, Variable + + for insn in knl.instructions: + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + 
insn.expression.function.name in subroutine_dict): + assignees = [] + new_params = [] + subroutine = subroutine_dict[insn.expression.function.name] + for par, arg in zip(insn.expression.parameters, subroutine.args): + if arg.name in subroutine.get_written_variables(): + par = modify_assignee_for_array_call(par) + assignees.append(par) + if arg.name in subroutine.get_read_variables(): + new_params.append(par) + if arg.name not in (subroutine.get_written_variables() | + subroutine.get_read_variables()): + new_params.append(par) + + new_insns.append( + insn.copy( + assignees=tuple(assignees), + expression=Variable( + insn.expression.function.name)(*new_params))) + else: + new_insns.append(insn) + pass + elif isinstance(insn, (Assignment, CInstruction, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError(type(insn).__name__) + + return knl.copy(instructions=new_insns) + + def parse_fortran(source, filename="", free_form=None, strict=None, - seq_dependencies=None, auto_dependencies=None, target=None): + seq_dependencies=None, auto_dependencies=None, target=None, + return_list_of_knls=False): """ - :returns: a :class:`loopy.Program` + :returns: an instance of :class:`list` of :class:`loopy.LoopKernel`s if + *return_list_of_knls* is True else a :class:`loopy.Program`. """ parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) @@ -286,6 +330,11 @@ def parse_fortran(source, filename="", free_form=None, strict=None, kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + if return_list_of_knls: + return kernels + + kernels = [_add_assignees_to_calls(knl, kernels) for knl in kernels] + from loopy.kernel.tools import identify_root_kernel from loopy.program import make_program from loopy.transform.callable import register_callable_kernel diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index ec1b10f1f..e44b183ed 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -9,7 +9,7 @@ import loopy as lp class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): - result = lp.parse_fortran(cell) + result = lp.parse_fortran(cell, return_list_of_knls=True) for knl in result: self.shell.user_ns[knl.name] = knl diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1f896bb97..f36a90575 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -37,6 +37,7 @@ from loopy.kernel.data import ( SubstitutionRule, AddressSpace, ValueArg) from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, CallInstruction) +from loopy.program import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1753,6 +1754,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic +@iterate_over_kernels_if_given_program def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) @@ -2175,56 +2177,55 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - if not is_callee_kernel: - from loopy.version import LANGUAGE_VERSION_SYMBOLS + from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) - lang_version = kwargs.pop("lang_version", None) - if lang_version is None: - # 
{{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + lang_version = kwargs.pop("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION - # This *is* gross. But it seems like the right thing interface-wise. - import inspect - caller_globals = inspect.currentframe().f_back.f_globals + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals - for ver_sym in LANGUAGE_VERSION_SYMBOLS: - try: - lang_version = caller_globals[ver_sym] - break - except KeyError: - pass + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass - # }}} + # }}} - if lang_version is None: - from warnings import warn - from loopy.diagnostic import LoopyWarning - from loopy.version import ( - MOST_RECENT_LANGUAGE_VERSION, - FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " - "To avoid this warning, pass " - "lang_version={ver} in this invocation. " - "(Or say 'from loopy.version import " - "{sym_ver}' in " - "the global scope of the calling frame.)" - .format( - ver=MOST_RECENT_LANGUAGE_VERSION, - sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] - ), - LoopyWarning, stacklevel=2) - - lang_version = FALLBACK_LANGUAGE_VERSION - - if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." % (lang_version,)) - if lang_version >= (2018, 1): - options = options.copy(enforce_variable_access_ordered=True) - if lang_version >= (2018, 2): - options = options.copy(ignore_boostable_into=True) + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. " + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + if lang_version not in version_to_symbol: + raise LoopyError("Language version '%s' is not known." % (lang_version,)) + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if lang_version >= (2018, 2): + options = options.copy(ignore_boostable_into=True) # }}} @@ -2382,11 +2383,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): - lang_version = kwargs.pop('lang_version', None) - if lang_version: - raise LoopyError("lang_version should be set for program, not " - "functions.") - kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 9d85f5e84..1ba0dc7ec 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1208,7 +1208,7 @@ def is_array_call(assignees, expression): return False -def modify_assignee_assignee_for_array_call(assignee): +def modify_assignee_for_array_call(assignee): """ Converts the assignee subscript or variable as a SubArrayRef. """ @@ -1258,7 +1258,7 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): # assignee as an instance of SubArrayRef. 
If not given as a # SubArrayRef return CallInstruction( - assignees=tuple(modify_assignee_assignee_for_array_call( + assignees=tuple(modify_assignee_for_array_call( assignee) for assignee in assignees), expression=expression, temp_var_types=temp_var_types, diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6c6a0dd9b..504493f4d 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -51,7 +51,7 @@ class ReductionOperation(object): def arg_count(self): raise NotImplementedError - def neutral_element(self, *dtypes): + def neutral_element(self, dtypes, callables_table, target): raise NotImplementedError def __hash__(self): @@ -84,9 +84,6 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) - def get_scalar_callables(self): - return frozenset() - class ScalarReductionOperation(ReductionOperation): def __init__(self, forced_result_type=None): @@ -128,29 +125,43 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. - return 0 + from loopy import auto + if dtype not in [None, auto] and dtype.numpy_dtype.kind == 'f': + return 0.0, callables_table - def __call__(self, dtype, operand1, operand2): - return operand1 + operand2 + return 0, callables_table + + def __call__(self, dtype, operand1, operand2, callables_table, target): + return operand1 + operand2, callables_table class ProductReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. 
- return 1 + from loopy import auto + if dtype not in [None, auto] and dtype.numpy_dtype.kind == 'f': + return 1.0, callables_table - def __call__(self, dtype, operand1, operand2): - return operand1 * operand2 + return 1, callables_table + + def __call__(self, dtype, operand1, operand2, callables_table, target): + return operand1 * operand2, callables_table def get_le_neutral(dtype): """Return a number y that satisfies (x <= y) for all y.""" if dtype.numpy_dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return var("INFINITY") + # OpenCL 1.2, section 6.12.2 + if dtype.numpy_dtype.itemsize == 4: + #float + return var("INFINITY") + elif dtype.numpy_dtype.itemsize == 8: + #double + return var("HUGE_VAL") + elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: @@ -167,8 +178,13 @@ def get_ge_neutral(dtype): """Return a number y that satisfies (x >= y) for all y.""" if dtype.numpy_dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return -var("INFINITY") + # OpenCL 1.2, section 6.12.2 + if dtype.numpy_dtype.itemsize == 4: + #float + return -var("INFINITY") + elif dtype.numpy_dtype.itemsize == 8: + #double + return -var("HUGE_VAL") elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: @@ -182,25 +198,53 @@ def get_ge_neutral(dtype): class MaxReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): - return get_ge_neutral(dtype) + def neutral_element(self, dtype, callables_table, target): + return get_ge_neutral(dtype), callables_table - def __call__(self, dtype, operand1, operand2): - return ResolvedFunction("max")(operand1, operand2) + def __call__(self, dtype, operand1, operand2, callables_table, target): + dtype, = dtype + + # getting the callable 'max' from target + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + max_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "max") + + # type specialize the callable + max_scalar_callable, callables_table = max_scalar_callable.with_types( + {0: dtype, 1: dtype}, None, callables_table) - def get_scalar_callables(self): - return frozenset(["max"]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + 'max', max_scalar_callable) + + return ResolvedFunction(func_id)(operand1, operand2), callables_table class MinReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): - return get_le_neutral(dtype) + def neutral_element(self, dtype, callables_table, target): + return get_le_neutral(dtype), callables_table - def __call__(self, dtype, operand1, operand2): - return ResolvedFunction("min")(operand1, operand2) + def __call__(self, dtype, operand1, operand2, callables_table, target): + dtype, = dtype + + # getting the callable 'max' from target + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + min_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "min") + + # type specialize the callable + min_scalar_callable, callables_table = min_scalar_callable.with_types( + {0: dtype, 1: dtype}, None, callables_table) - def get_scalar_callables(self): - return frozenset(["min"]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + 'min', min_scalar_callable) + + return 
ResolvedFunction(func_id)(operand1, operand2), callables_table # {{{ base class for symbolic reduction ops @@ -259,10 +303,26 @@ class _SegmentedScalarReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, segment_flag_dtype.numpy_dtype.type.__name__) - def neutral_element(self, scalar_dtype, segment_flag_dtype): - scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return ResolvedFunction("make_tuple")(scalar_neutral_element, - segment_flag_dtype.numpy_dtype.type(0)) + def neutral_element(self, scalar_dtype, segment_flag_dtype, + callables_table, target): + scalar_neutral_element, calables_table = ( + self.inner_reduction.neutral_element( + scalar_dtype, callables_table, target)) + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + make_tuple_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "make_tuple") + make_tuple_scalar_callable, _ = ( + make_tuple_scalar_callable.with_types( + dict(enumerate([scalar_dtype, segment_flag_dtype])), None, + None)) + callables_table, func_id = callables_table.with_added_callable( + "make_tuple", make_tuple_scalar_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + segment_flag_dtype.numpy_dtype.type(0)), callables_table def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) @@ -277,11 +337,27 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def __eq__(self, other): return type(self) == type(other) - def __call__(self, dtypes, operand1, operand2): - return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + # getting the callable 'max' from target + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + segmented_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, SegmentedOp(self)) + + # type specialize the callable + segmented_scalar_callable, callables_table = ( + segmented_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + None, callables_table)) - def get_scalar_callables(self): - return frozenset(["make_tuple", SegmentedOp(self)]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + SegmentedOp(self), segmented_scalar_callable) + + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -335,12 +411,27 @@ class _ArgExtremumReductionOperation(ReductionOperation): def result_dtypes(self, kernel, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) - def neutral_element(self, scalar_dtype, index_dtype): + def neutral_element(self, scalar_dtype, index_dtype, callables_table, + target): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return ResolvedFunction("make_tuple")(scalar_neutral_element, - index_dtype.numpy_dtype.type(-1)) + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + + make_tuple_scalar_callable = find_in_knl_callable_from_identifier( + 
_default_func_id_to_kernel_callable_mappers(target), + target, "make_tuple") + make_tuple_scalar_callable, _ = ( + make_tuple_scalar_callable.with_types( + dict(enumerate([scalar_dtype, index_dtype])), None, + None)) + callables_table, func_id = callables_table.with_added_callable( + "make_tuple", make_tuple_scalar_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + index_dtype.numpy_dtype.type(-1)), callables_table def __str__(self): return self.which @@ -355,11 +446,27 @@ class _ArgExtremumReductionOperation(ReductionOperation): def arg_count(self): return 2 - def __call__(self, dtypes, operand1, operand2): - return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + # getting the callable 'max' from target + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + arg_ext_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, ArgExtOp(self)) + + # type specialize the callable + arg_ext_scalar_callable, callables_table = ( + arg_ext_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + None, callables_table)) + + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + ArgExtOp(self), arg_ext_scalar_callable) - def get_scalar_callables(self): - return frozenset([self.which, "make_tuple", ArgExtOp(self)]) + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index de620ef9a..c6b69da83 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -38,8 +38,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import RuleAwareIdentityMapper - +from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) from loopy.program import Program, iterate_over_kernels_if_given_program @@ -899,6 +898,18 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} +class RealizeReductionCallbackMapper(ReductionCallbackMapper): + def __init__(self, callback, callables_table): + super(RealizeReductionCallbackMapper, self).__init__( + callback) + self.callables_table = callables_table + + def map_reduction(self, expr, **kwargs): + result, self.callables_table = self.callback(expr, self.rec, + **kwargs) + return result + + def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): @@ -1046,13 +1057,16 @@ def realize_reduction_for_single_kernel(kernel, callables_table, init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - 
expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, predicates=insn.predicates,) generated_insns.append(init_insn) @@ -1087,13 +1101,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table, else: reduction_expr = expr.expr + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + reduction_expr, + callables_table, + kernel.target) + reduction_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - reduction_expr), + expression=expression, depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final, @@ -1105,9 +1123,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1190,7 +1208,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, + callables_table=callables_table, target=kernel.target) init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( id=init_id, @@ -1243,17 +1262,20 @@ def realize_reduction_for_single_kernel(kernel, callables_table, reduction_expr = expr.expr transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar( + neutral_var_names, + tuple(var(nvn) for nvn in neutral_var_names)), + reduction_expr, + callables_table, + kernel.target) transfer_insn = make_assignment( id=transfer_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar( - neutral_var_names, - tuple(var(nvn) for nvn in neutral_var_names)), - reduction_expr), + expression=expression, within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -1282,22 +1304,26 @@ def realize_reduction_for_single_kernel(kernel, callables_table, new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname) stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + ( + var(stage_exec_iname) + new_size,)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + stage_insn = make_assignment( id=stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + ( - var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars))), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1318,9 +1344,10 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if 
nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (0,)] + return acc_vars[0][outer_local_iname_vars + (0,)], callables_table else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return [acc_var[outer_local_iname_vars + (0,)] for acc_var in + acc_vars], callables_table # }}} # {{{ utils (stateful) @@ -1414,6 +1441,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if global_barrier is not None: init_insn_depends_on |= frozenset([global_barrier]) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, @@ -1421,7 +1451,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, predicates=insn.predicates, ) @@ -1440,13 +1470,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if insn.within_inames_is_final: update_insn_iname_deps = insn.within_inames | set([track_iname]) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs), + callables_table, + kernel.target) + scan_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - _strip_if_scalar(acc_vars, updated_inner_exprs)), + expression=expression, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, @@ -1460,9 +1494,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1536,7 +1570,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) init_insn_depends_on = insn.depends_on @@ -1635,19 +1670,23 @@ def realize_reduction_for_single_kernel(kernel, callables_table, write_stage_id = insn_id_gen( "scan_%s_write_stage_%d" % (scan_iname, istage)) + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, read_vars), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + write_stage_insn = make_assignment( id=write_stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, read_vars), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)) - ), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1668,10 +1707,11 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars 
+ (output_idx,)] + return (acc_vars[0][outer_local_iname_vars + (output_idx,)], + callables_table) else: return [acc_var[outer_local_iname_vars + (output_idx,)] - for acc_var in acc_vars] + for acc_var in acc_vars], callables_table # }}} @@ -1765,7 +1805,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # to reduce over. It's rather similar to an array with () shape in # numpy.) - return expr.expr + return expr.expr, callables_table # }}} @@ -1833,8 +1873,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # }}} - from loopy.symbolic import ReductionCallbackMapper - cb_mapper = ReductionCallbackMapper(map_reduction) + cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) insn_queue = kernel.instructions[:] insn_id_replacements = {} @@ -1862,13 +1901,14 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # Run reduction expansion. from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: + # FIXME[KK]: With the new mapper emitting callables_table + # something should be done. new_expressions = cb_mapper(insn.expression, callables_table=callables_table, nresults=nresults) else: - new_expressions = ( - cb_mapper(insn.expression, - callables_table=callables_table),) + new_expressions = cb_mapper(insn.expression, + callables_table=callables_table), if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1955,32 +1995,28 @@ def realize_reduction_for_single_kernel(kernel, callables_table, _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) - return kernel + return kernel, cb_mapper.callables_table def realize_reduction(program, *args, **kwargs): assert isinstance(program, Program) - new_resolved_functions = {} - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = realize_reduction_for_single_kernel( - in_knl_callable.subkernel, program.callables_table, - *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable + callables_table = program.callables_table.copy() + kernels_to_scan = [in_knl_callable.subkernel for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)] + + for knl in kernels_to_scan: + new_knl, callables_table = realize_reduction_for_single_kernel( + knl, callables_table, *args, **kwargs) + in_knl_callable = callables_table[knl.name].copy( + subkernel=new_knl) + resolved_functions = callables_table.resolved_functions.copy() + resolved_functions[knl.name] = in_knl_callable + callables_table = callables_table.copy( + resolved_functions=resolved_functions) - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - return program.copy(callables_table=new_callables_table) + return program.copy(callables_table=callables_table) # }}} @@ -2338,9 +2374,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): # }}} - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) - # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. 
@@ -2348,20 +2381,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel) - - # Ordering restrictions: - # - # - realize_reduction must happen after type inference because it needs - # to be able to determine the types of the reduced expressions. - # - # - realize_reduction must happen after default dependencies are added - # because it manipulates the depends_on field, which could prevent - # defaults from being applied. - kernel = realize_reduction_for_single_kernel(kernel, - callables_table, unknown_types_ok=False) - # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators # need to be duplicated by this. @@ -2451,6 +2470,23 @@ def preprocess_program(program, device=None): program = infer_unknown_types(program, expect_completion=False) + from loopy.transform.subst import expand_subst + program = expand_subst(program) + + from loopy.kernel.creation import apply_single_writer_depencency_heuristic + program = apply_single_writer_depencency_heuristic(program) + + # Ordering restrictions: + # + # - realize_reduction must happen after type inference because it needs + # to be able to determine the types of the reduced expressions. + # + # - realize_reduction must happen after default dependencies are added + # because it manipulates the depends_on field, which could prevent + # defaults from being applied. + + program = realize_reduction(program, unknown_types_ok=False) + # {{{ preprocess callable kernels # Callable editing restrictions: diff --git a/loopy/program.py b/loopy/program.py index 1fb691531..191a13fa1 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -56,6 +56,25 @@ __doc__ = """ """ +def find_in_knl_callable_from_identifier( + function_id_to_in_knl_callable_mappers, target, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if the + :arg:`identifier` is known to any kernel function scoper, otherwise returns + *None*. + """ + for func_id_to_in_knl_callable_mapper in ( + function_id_to_in_knl_callable_mappers): + # fixme: do we really need to given target for the function + in_knl_callable = func_id_to_in_knl_callable_mapper( + target, identifier) + if in_knl_callable is not None: + return in_knl_callable + + return None + + class ResolvedFunctionMarker(RuleAwareIdentityMapper): """ Mapper to convert the ``function`` attribute of a @@ -82,23 +101,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): self.function_id_to_in_knl_callable_mappers = ( function_id_to_in_knl_callable_mappers) - def find_in_knl_callable_from_identifier(self, identifier): - """ - Returns an instance of - :class:`loopy.kernel.function_interface.InKernelCallable` if the - :arg:`identifier` is known to any kernel function scoper, otherwise returns - *None*. 
- """ - for func_id_to_in_knl_callable_mapper in ( - self.function_id_to_in_knl_callable_mappers): - # fixme: do we really need to given target for the function - in_knl_callable = func_id_to_in_knl_callable_mapper( - self.kernel.target, identifier) - if in_knl_callable is not None: - return in_knl_callable - - return None - def map_call(self, expr, expn_state): from loopy.symbolic import parse_tagged_name @@ -117,7 +119,9 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # search the kernel for the function. - in_knl_callable = self.find_in_knl_callable_from_identifier( + in_knl_callable = find_in_knl_callable_from_identifier( + self.function_id_to_in_knl_callable_mappers, + self.kernel.target, expr.function.name) if in_knl_callable: @@ -140,16 +144,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, expn_state) - def map_reduction(self, expr, expn_state): - for func_id in ( - expr.operation.get_scalar_callables()): - in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) - assert in_knl_callable is not None - self.callables_table, _ = ( - self.callables_table.with_added_callable(func_id, - in_knl_callable)) - return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) - def _default_func_id_to_kernel_callable_mappers(target): """ @@ -525,8 +519,7 @@ class CallablesCountingMapper(CombineMapper): map_call_with_kwargs = map_call def map_reduction(self, expr): - return Counter(expr.operation.get_scalar_callables()) + ( - super(CallablesCountingMapper, self).map_reduction(expr)) + return super(CallablesCountingMapper, self).map_reduction(expr) def map_constant(self, expr): return Counter() @@ -774,13 +767,18 @@ class CallablesTable(ImmutableRecord): # {{{ non-edit mode if not self.is_being_edited: - if function.name in self.resolved_functions and ( - self.resolved_functions[function.name] == in_kernel_callable): + if isinstance(function, ReductionOpFunction): + function_name = function + else: + function_name = function.name + + if function_name in self.resolved_functions and ( + self.resolved_functions[function_name] == in_kernel_callable): # if not being edited, check that the given function is # equal to the old version of the callable. return self, function else: - print('Old: ', self.resolved_functions[function.name]) + print('Old: ', self.resolved_functions[function_name]) print('New: ', in_kernel_callable) raise LoopyError("Use 'with_enter_edit_callables_mode' first.") diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6f3c6f2be..870f9fc2c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -719,7 +719,7 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") -class ResolvedFunction(p.Expression): +class ResolvedFunction(LoopyExpressionBase): """ A function invocation whose definition is known in a :mod:`loopy` kernel. 
Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression @@ -758,8 +758,8 @@ class ResolvedFunction(p.Expression): def __getinitargs__(self): return (self.function, ) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_resolved_function") @@ -807,7 +807,7 @@ class SweptInameStrideCollector(CoefficientCollectorBase): return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) -class SubArrayRef(p.Expression): +class SubArrayRef(LoopyExpressionBase): """ An algebraic expression to map an affine memory layout pattern (known as sub-arary) as consecutive elements of the sweeping axes which are defined @@ -871,8 +871,8 @@ class SubArrayRef(p.Expression): and other.subscript == self.subscript and other.swept_inames == self.swept_inames) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_sub_array_ref") diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 10161378b..82478a268 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -25,6 +25,7 @@ THE SOFTWARE. """ import numpy as np +import six from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -183,14 +184,17 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), callables_table) - dtype = np.find_common_type( + common_dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() if (id >= 0 and dtype is not None)]) - if dtype.kind in ['u', 'i', 'f']: - if dtype.kind == 'f': + if common_dtype.kind in ['u', 'i', 'f']: + if common_dtype.kind == 'f': name = 'f'+name - dtype = NumpyType(dtype) + + target = [dtype.target for dtype in six.itervalues(arg_id_to_dtype) + if (id >= 0 and dtype is not None)][0] + dtype = NumpyType(common_dtype, target) return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), @@ -198,7 +202,7 @@ class OpenCLCallable(ScalarCallable): else: # Unsupported type. 
raise LoopyError("%s function not supported for the types %s" % - (name, dtype)) + (name, common_dtype)) if name == "dot": for id in arg_id_to_dtype: @@ -319,6 +323,8 @@ def opencl_symbol_mangler(kernel, name): return NumpyType(np.dtype(np.int32)), name elif name.startswith("LONG_"): return NumpyType(np.dtype(np.int64)), name + elif name == "HUGE_VAL": + return NumpyType(np.dtype(np.float64)), name else: return None diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 479843697..7534818d7 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -50,7 +50,7 @@ __doc__ = """ # {{{ register function lookup -def _resolved_callables_from_function_lookup(program, +def _resolve_callables_from_function_lookup(program, func_id_to_in_kernel_callable_mapper): """ Returns a copy of *program* with the expression nodes marked "Resolved" @@ -124,7 +124,7 @@ def register_function_id_to_in_knl_callable_mapper(program, new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( [func_id_to_in_knl_callable_mapper]) - program = _resolved_callables_from_function_lookup(program, + program = _resolve_callables_from_function_lookup(program, func_id_to_in_knl_callable_mapper) new_program = program.copy( @@ -173,11 +173,17 @@ def register_callable_kernel(program, callee_kernel): # the number of assigness in the callee kernel intructions. expected_num_assignees = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_written_variables()]) - expected_num_parameters = len([arg for arg in callee_kernel.args if + expected_max_num_parameters = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_read_variables()]) + len( [arg for arg in callee_kernel.args if arg.name not in (callee_kernel.get_read_variables() | callee_kernel.get_written_variables())]) + expected_min_num_parameters = len([arg for arg in callee_kernel.args if + arg.name in callee_kernel.get_read_variables() and arg.name not in + callee_kernel.get_written_variables()]) + len( + [arg for arg in callee_kernel.args if arg.name not in + (callee_kernel.get_read_variables() | + callee_kernel.get_written_variables())]) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel @@ -195,11 +201,21 @@ def register_callable_kernel(program, callee_kernel): "match." % ( callee_kernel.name, insn.id)) if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of " - "parameters in instruction %s do not match." - % (callee_kernel.name, insn.id)) + kw_parameters.values())) > expected_max_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' exceed" + " the max. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." + % (insn.id, callee_kernel.name)) + if len(insn.expression.parameters+tuple( + kw_parameters.values())) < expected_min_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' is less than" + " the min. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." 
+ % (insn.id, callee_kernel.name)) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 9b83f242b..45e9c0a06 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -419,6 +419,11 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + from loopy.program import make_program + + programs = [make_program(knl) if isinstance(knl, LoopKernel) else knl for + knl in programs] + # all the resolved functions in programs must be registered in # main_callables_table main_prog_callables_info = ( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 281dcb43d..2101fd2fc 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -998,7 +998,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, # functions if _instruction_missed_during_inference(insn): type_inf_mapper(insn.expression, - return_tuple=len(insn.assignees) > 1, + return_tuple=len(insn.assignees) != 1, return_dtype_set=True) elif isinstance(insn, (_DataObliviousInstruction, lp.CInstruction)): diff --git a/test/test_callables.py b/test/test_callables.py index f2f3acbd6..731593ea3 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -63,38 +63,35 @@ def test_register_function_lookup(ctx_factory): def test_register_knl(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) grandchild_knl = lp.make_function( - "{[i, j]:0<= i, j< 16}", + "{[i, j]:0<= i, j< 4}", """ c[i, j] = 2*a[i, j] + 3*b[i, j] """, name='linear_combo1') child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) """, name='linear_combo2') parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """, kernel_data=[ lp.GlobalArg( - name='x', + name='x, y', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -115,36 +112,29 @@ def test_register_knl(ctx_factory, inline): def test_slices_with_negative_step(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", + "{[i, k, m]: 0<=i, k, m<4}", """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], y[i, :, k, :, m]) """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -163,7 +153,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 2 + n = 4 a_dev = 
cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) @@ -215,27 +205,27 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 5 + n = 4 x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) callee_knl = lp.make_function( - "{[i, j]:0<=i, j < 32}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name='linear_combo') - callee_knl = lp.split_iname(callee_knl, "i", 2, inner_tag="l.0", outer_tag="g.0") + callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0", outer_tag="g.0") caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<32}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """ ) - caller_knl = lp.split_iname(caller_knl, "i", 8, inner_tag="l.1", outer_tag="g.1") + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( caller_knl, callee_knl) @@ -252,8 +242,8 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): x_host = x_dev.get() y_host = y_dev.get() - assert gsize == (16, 4) - assert lsize == (2, 8) + assert gsize == (4, 1) + assert lsize == (1, 4) assert np.linalg.norm(2*x_host+3*y_host-out['z'].get())/np.linalg.norm( 2*x_host+3*y_host) < 1e-15 @@ -484,13 +474,13 @@ def test_empty_sub_array_refs(ctx_factory, inline): def test_array_inputs_to_callee_kernels(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 2 ** 3 x = np.random.rand(n, n) y = np.random.rand(n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 8}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") @@ -502,17 +492,10 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16)), '...'], + shape=(n, n)), + '...'] ) knl = lp.register_callable_kernel( diff --git a/test/test_fortran.py b/test/test_fortran.py index 437199810..1ab28409b 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -533,9 +533,11 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! prg = lp.parse_fortran(SOURCE) - ! fill = prg["fill"] - ! twice = prg["twice"] + ! # FIXME: correct this after the "Module" is done. + ! # prg = lp.parse_fortran(SOURCE) + ! # fill = prg["fill"] + ! # twice = prg["twice"] + ! fill, twice = lp.parse_fortran(SOURCE, return_list_of_knls=True) ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) ! 
RESULT = knl diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 1ba44e77e..55a2d2e11 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -60,7 +60,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa source = source.replace("datafloat", "real*4") hsv_r, hsv_s = [ - knl for knl in lp.parse_fortran(source, filename, seq_dependencies=False) + knl for knl in lp.parse_fortran(source, filename, + seq_dependencies=False, return_list_of_knls=True) if "KernelR" in knl.name or "KernelS" in knl.name ] hsv_r = lp.tag_instructions(hsv_r, "rknl") @@ -229,6 +230,15 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa hsv = tap_hsv + hsv = lp.set_options(hsv, + ignore_boostable_into=True, + cl_build_options=[ + "-cl-denorms-are-zero", + "-cl-fast-relaxed-math", + "-cl-finite-math-only", + "-cl-mad-enable", + "-cl-no-signed-zeros"]) + if 1: print("OPS") op_map = lp.get_op_map(hsv, subgroup_size=32) @@ -238,14 +248,6 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes() print(lp.stringify_stats_mapping(gmem_map)) - hsv = lp.set_options(hsv, cl_build_options=[ - "-cl-denorms-are-zero", - "-cl-fast-relaxed-math", - "-cl-finite-math-only", - "-cl-mad-enable", - "-cl-no-signed-zeros", - ]) - # FIXME: renaming's a bit tricky in this program model. # add a simple transformation for it # hsv = hsv.copy(name="horizontalStrongVolumeKernel") -- GitLab
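
Usage note (illustrative sketch, not part of the patch itself): the hunks above change ``lp.parse_fortran`` so that it returns a single ``loopy.Program`` by default, which is why the ``.floopy`` examples move from ``dgemm, = lp.parse_fortran(...)`` / ``RESULT = [dgemm]`` to ``dgemm = lp.parse_fortran(...)`` / ``RESULT = dgemm``, while callers that still want a plain list of kernels (the ``%%fortran_kernel`` IPython magic, ``test_numa_diff.py``, ``test_parse_and_fuse_two_kernels``) opt back in via ``return_list_of_knls=True``. A minimal sketch of both call styles, assuming only the API shown in the diff (the Fortran subroutine and the iname ``i`` below are made up for illustration)::

    import loopy as lp

    FORTRAN_SOURCE = """
    subroutine fill(out, a, n)
      implicit none
      real*8 a, out(n)
      integer n, i
      do i = 1, n
        out(i) = a
      end do
    end subroutine
    """

    # Default behaviour after this patch: a single loopy.Program comes back,
    # and transforms such as lp.split_iname accept it directly (matching the
    # updated .floopy examples, which now end with "RESULT = <prog>").
    prog = lp.parse_fortran(FORTRAN_SOURCE)
    prog = lp.split_iname(prog, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Opt in to the old list-of-LoopKernels behaviour, e.g. to feed
    # lp.fuse_kernels or the %%fortran_kernel IPython magic.
    fill, = lp.parse_fortran(FORTRAN_SOURCE, return_list_of_knls=True)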