From 2f430adffb1d2eb4933f2c6ec93eb951f3927c19 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kgk2@illinois.edu>
Date: Mon, 2 Jul 2018 20:24:57 -0500
Subject: [PATCH] Hunk edits to isolate the new function interface

---
 doc/index.rst                        |   1 +
 loopy/__init__.py                    |   8 +
 loopy/check.py                       | 102 +++++++-
 loopy/codegen/__init__.py            |  54 ++++
 loopy/kernel/__init__.py             |  49 ++--
 loopy/kernel/creation.py             | 156 +++++++++++-
 loopy/kernel/tools.py                |   8 +
 loopy/library/function.py            |  39 +++
 loopy/library/random123.py           | 104 ++++----
 loopy/library/reduction.py           | 216 +++++++---------
 loopy/preprocess.py                  | 359 +++++++++++++++++++++++++++
 loopy/statistics.py                  |   9 +-
 loopy/symbolic.py                    |  86 ++++++-
 loopy/target/__init__.py             |   7 +-
 loopy/target/c/__init__.py           | 233 ++++++++---------
 loopy/target/c/codegen/expression.py |  84 ++-----
 loopy/target/cuda.py                 |  84 +++++--
 loopy/target/opencl.py               | 182 +++++++++-----
 loopy/target/pyopencl.py             | 110 +++++---
 loopy/target/python.py               |  52 ++--
 loopy/transform/diff.py              |   9 +-
 loopy/type_inference.py              | 183 ++++++++++++--
 test/testlib.py                      |  40 +++
 23 files changed, 1616 insertions(+), 559 deletions(-)

diff --git a/doc/index.rst b/doc/index.rst
index d862a8acd..0644b34c4 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -68,6 +68,7 @@ Please check :ref:`installation` to get started.
     ref_creation
     ref_kernel
     ref_transform
+    ref_call
     ref_other
     misc
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index f50ce237c..d541f1dae 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -51,6 +51,8 @@ from loopy.kernel.data import (
         TemporaryVariable,
         SubstitutionRule,
         CallMangleInfo)
+from loopy.kernel.function_interface import (
+        ScalarCallable)
 
 from loopy.kernel import LoopKernel, KernelState, kernel_state
 from loopy.kernel.tools import (
@@ -119,6 +121,8 @@ from loopy.transform.batch import to_batched
 from loopy.transform.parameter import assume, fix_parameters
 from loopy.transform.save import save_and_reload_temporaries
 from loopy.transform.add_barrier import add_barrier
+from loopy.transform.callable import register_function_lookup
+
 # }}}
 
 from loopy.type_inference import infer_unknown_types
@@ -168,6 +172,8 @@ __all__ = [
         "CallInstruction", "CInstruction", "NoOpInstruction",
         "BarrierInstruction",
 
+        "ScalarCallable",
+
         "KernelArgument",
         "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg",
         "AddressSpace", "temp_var_scope",   # temp_var_scope is deprecated
@@ -230,6 +236,8 @@ __all__ = [
 
         "add_barrier",
 
+        "register_function_lookup",
+
         # }}}
 
         "get_dot_dependency_graph",
diff --git a/loopy/check.py b/loopy/check.py
index 84f3b04e0..dd96c1ba6 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -27,9 +27,13 @@ from six.moves import range
 
 from islpy import dim_type
 import islpy as isl
-from loopy.symbolic import WalkMapper
+from loopy.symbolic import WalkMapper, CombineMapper, ScopedFunction
 from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel
 
+from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction,
+        _DataObliviousInstruction)
+from functools import reduce
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -55,6 +59,74 @@ def check_identifiers_in_subst_rules(knl):
                     "kernel-global identifiers"
                     % (knl.name, ", ".join(deps-rule_allowed_identifiers)))
 
+
+class UnscopedCallCollector(CombineMapper):
+    """
+    Collects all the unscoped calls within a kernel.
+
+    :returns:
+        A :class:`frozenset` of function names that are not scoped in
+        the kernel.
+
+    .. note::
+        :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are
+        never scoped in the pipeline.
+    """
+
+    def combine(self, values):
+        import operator
+        return reduce(operator.or_, values, frozenset())
+
+    def map_call(self, expr):
+        from loopy.library.reduction import ArgExtOp
+        if not isinstance(expr.function, (ScopedFunction, ArgExtOp)):
+            return (frozenset([expr.function.name]) |
+                    self.combine((self.rec(child) for child in expr.parameters)))
+        else:
+            return self.combine((self.rec(child) for child in expr.parameters))
+
+    def map_call_with_kwargs(self, expr):
+        if not isinstance(expr.function, ScopedFunction):
+            return (frozenset([expr.function.name]) |
+                    self.combine((self.rec(child) for child in expr.parameters
+                        + tuple(expr.kw_parameters.values()))))
+        else:
+            return self.combine((self.rec(child) for child in
+                expr.parameters+tuple(expr.kw_parameters.values())))
+
+    def map_constant(self, expr):
+        return frozenset()
+
+    map_variable = map_constant
+    map_function_symbol = map_constant
+    map_tagged_variable = map_constant
+    map_type_cast = map_constant
+
+
+def check_functions_are_scoped(kernel):
+    """ Checks if all the calls in the instruction expression have been scoped,
+    otherwise raises an error naming a call that still awaits a signature. See
+    :class:`loopy.symbolic.ScopedFunction` for a detailed explanation of a
+    scoped function.
+    """
+
+    from loopy.symbolic import SubstitutionRuleExpander
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
+
+    for insn in kernel.instructions:
+        if isinstance(insn, MultiAssignmentBase):
+            unscoped_calls = UnscopedCallCollector()(subst_expander(
+                insn.expression))
+            if unscoped_calls:
+                raise LoopyError("Unknown function '%s' obtained -- register a "
+                        "function or a kernel corresponding to it." %
+                        set(unscoped_calls).pop())
+        elif isinstance(insn, (CInstruction, _DataObliviousInstruction)):
+            pass
+        else:
+            raise NotImplementedError(
+                    "Unknown type of instruction %s" % type(insn).__name__)
+
 # }}}
 
 
@@ -113,6 +185,18 @@ def check_loop_priority_inames_known(kernel):
                 raise LoopyError("unknown iname '%s' in loop priorities" % iname)
 
 
+def _get_all_unique_iname_tags(kernel):
+    """Returns a set of all the iname tags used in *kernel* that
+    inherit from :class:`loopy.kernel.data.UniqueTag`.
+    """
+    from loopy.kernel.data import UniqueTag
+    iname_tags = [kernel.iname_to_tag.get(iname) for iname in
+        kernel.all_inames()]
+    return set(
+            tag for tag in iname_tags if
+            isinstance(tag, UniqueTag))
+
+
 def check_multiple_tags_allowed(kernel):
     from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag,
                 UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type)
@@ -129,6 +213,7 @@ def check_multiple_tags_allowed(kernel):
 
 def check_for_double_use_of_hw_axes(kernel):
     from loopy.kernel.data import UniqueTag
+    from loopy.kernel.instruction import CallInstruction
 
     for insn in kernel.instructions:
         insn_tag_keys = set()
@@ -141,6 +226,21 @@ def check_for_double_use_of_hw_axes(kernel):
 
                 insn_tag_keys.add(key)
 
+        # check usage of iname tags in the callee kernel
+        if isinstance(insn, CallInstruction):
+            in_knl_callable = kernel.scoped_functions[
+                    insn.expression.function.name]
+            if isinstance(in_knl_callable, CallableKernel):
+                # check for collision in iname_tag keys in the instruction
+                # due to the callee kernel
+                common_iname_tags = [tag for tag in
+                        _get_all_unique_iname_tags(in_knl_callable.subkernel)
+                        if tag.key in insn_tag_keys]
+                if common_iname_tags:
+                    raise LoopyError("instruction '%s' has multiple "
+                            "inames tagged '%s'" % (insn.id,
+                                common_iname_tags.pop()))
+
 
 def check_for_inactive_iname_access(kernel):
     for insn in kernel.instructions:
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 11f874e1b..16fef45b5 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -32,6 +32,16 @@ from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 
+from cgen import Collection
+from loopy.symbolic import CombineMapper
+
+from loopy.kernel.instruction import (
+        Assignment, NoOpInstruction, BarrierInstruction, CallInstruction,
+        CInstruction, _DataObliviousInstruction, MultiAssignmentBase)
+
+from functools import reduce
+
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -362,6 +372,32 @@ code_gen_cache = WriteOncePersistentDict(
          key_builder=LoopyKeyBuilder())
 
 
+class InKernelCallablesCollector(CombineMapper):
+    """
+    Returns an instance of :class:`frozenset` containing instances of
+    :class:`loopy.kernel.function_interface.InKernelCallable` in the
+    *kernel*.
+    """
+    def __init__(self, kernel):
+        self.kernel = kernel
+
+    def combine(self, values):
+        import operator
+        return reduce(operator.or_, values, frozenset())
+
+    def map_scoped_function(self, expr):
+        return frozenset([self.kernel.scoped_functions[
+            expr.name]])
+
+    def map_constant(self, expr):
+        return frozenset()
+
+    map_variable = map_constant
+    map_function_symbol = map_constant
+    map_tagged_variable = map_constant
+    map_type_cast = map_constant
+
+
 class PreambleInfo(ImmutableRecord):
     """
     .. attribute:: kernel
@@ -506,6 +542,24 @@ def generate_code_v2(kernel):
     for prea_gen in preamble_generators:
         preambles.extend(prea_gen(preamble_info))
 
+    # {{{ collect preambles from all the in kernel callables.
+
+    in_knl_callable_collector = InKernelCallablesCollector(kernel)
+
+    for insn in kernel.instructions:
+        if isinstance(insn, MultiAssignmentBase):
+            for in_knl_callable in in_knl_callable_collector(insn.expression):
+                preambles.extend(in_knl_callable.generate_preambles(kernel.target))
+
+        elif isinstance(insn, (CInstruction, _DataObliviousInstruction)):
+            pass
+        else:
+            raise NotImplementedError(
+                    "Unknown instruction type '%s'"
+                    % type(insn).__name__)
+
+    # }}}
+
     codegen_result = codegen_result.copy(device_preambles=preambles)
 
     # }}}
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 6b0033808..e89455d30 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -37,10 +37,6 @@ import re
 
 from pytools import UniqueNameGenerator, generate_unique_names
 
-from loopy.library.function import (
-        default_function_mangler,
-        single_arg_function_mangler)
-
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
 from loopy.tools import natsorted
 from loopy.diagnostic import StaticValueFindingError
@@ -186,6 +182,11 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     .. attribute:: function_manglers
     .. attribute:: symbol_manglers
 
+    .. attribute:: function_scopers
+
+        A list of functions of signature ``(target, name)`` returning a
+        :class:`loopy.kernel.function_interface.InKernelCallable` or *None*.
+
     .. attribute:: substitutions
 
         a mapping from substitution names to
@@ -238,6 +239,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             iname_to_tags=None,
             substitutions=None,
             function_manglers=None,
+            function_scopers=None,
+            scoped_functions={},
             symbol_manglers=[],
 
             iname_slab_increments=None,
@@ -277,15 +280,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if substitutions is None:
             substitutions = {}
         if function_manglers is None:
-            function_manglers = [
-                default_function_mangler,
-                single_arg_function_mangler,
-                ]
-        if symbol_manglers is None:
-            function_manglers = [
-                default_function_mangler,
-                single_arg_function_mangler,
-                ]
+            function_manglers = []
         if iname_slab_increments is None:
             iname_slab_increments = {}
 
@@ -348,6 +343,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains)
         assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT
 
+        if function_scopers is None:
+            # populate the function scopers from the target and the loopy
+            # specific callable scopers
+
+            from loopy.library.function import loopy_specific_callable_scopers
+            function_scopers = [loopy_specific_callable_scopers] + (
+                    target.get_device_ast_builder().function_scopers())
+
         ImmutableRecordWithoutPickling.__init__(self,
                 domains=domains,
                 instructions=instructions,
@@ -367,6 +370,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 cache_manager=cache_manager,
                 applied_iname_rewrites=applied_iname_rewrites,
                 function_manglers=function_manglers,
+                function_scopers=function_scopers,
+                scoped_functions=scoped_functions,
                 symbol_manglers=symbol_manglers,
                 index_dtype=index_dtype,
                 options=options,
@@ -380,7 +385,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # }}}
 
-    # {{{ function mangling
+    # {{{ function mangling/scoping
 
     def mangle_function(self, identifier, arg_dtypes, ast_builder=None):
         if ast_builder is None:
@@ -423,6 +428,20 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return None
 
+    def find_scoped_function_identifier(self, identifier):
+        """
+        Returns an instance of
+        :class:`loopy.kernel.function_interface.InKernelCallable` if the
+        *identifier* is known to any kernel function scoper, otherwise returns
+        *None*.
+        """
+        for scoper in self.function_scopers:
+            in_knl_callable = scoper(self.target, identifier)
+            if in_knl_callable:
+                return in_knl_callable
+
+        return None
+
     # }}}
 
     # {{{ symbol mangling
@@ -1505,7 +1524,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
             "preamble_generators",
             "function_manglers",
+            "function_scopers",
             "symbol_manglers",
+            "scoped_functions",
             )
 
     def update_persistent_hash(self, key_hash, key_builder):
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index c2b54cf8b..8b371b47d 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -24,16 +24,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-
 import numpy as np
 
 from pymbolic.mapper import CSECachingMapperMixin
+from pymbolic.primitives import Slice, Variable, Subscript
 from loopy.tools import intern_frozenset_of_ids
-from loopy.symbolic import IdentityMapper, WalkMapper
+from loopy.symbolic import (
+        IdentityMapper, WalkMapper, SubArrayRef,
+        RuleAwareIdentityMapper)
 from loopy.kernel.data import (
         InstructionBase,
         MultiAssignmentBase, Assignment,
-        SubstitutionRule)
+        SubstitutionRule, AddressSpace)
+from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction,
+        CallInstruction)
 from loopy.diagnostic import LoopyError, warn_with_kernel
 import islpy as isl
 from islpy import dim_type
@@ -1139,7 +1143,7 @@ class ArgumentGuesser:
     def make_new_arg(self, arg_name):
         arg_name = arg_name.strip()
 
-        from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace
+        from loopy.kernel.data import ValueArg, ArrayArg
         import loopy as lp
 
         if arg_name in self.all_params:
@@ -1835,6 +1839,148 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True):
 # }}}
 
 
+# {{{ scope functions
+
+class FunctionScoper(RuleAwareIdentityMapper):
+    """
+    Mapper to convert the ``function`` attribute of every
+    :class:`pymbolic.primitives.Call` known in the kernel to an instance of
+    :class:`loopy.symbolic.ScopedFunction`. A function is known in the
+    *kernel* if :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier`
+    returns an instance of
+    :class:`loopy.kernel.function_interface.InKernelCallable`.
+
+    **Example:** If given an expression of the form ``sin(x) + unknown_function(y) +
+    log(z)``, then the mapper would return ``ScopedFunction('sin')(x) +
+    unknown_function(y) + ScopedFunction('log')(z)``.
+
+    :arg rule_mapping_context: An instance of
+        :class:`loopy.symbolic.RuleMappingContext`.
+    :arg kernel: The kernel whose ``find_scoped_function_identifier`` is
+        queried to decide whether a called function is known.
+    """
+    def __init__(self, rule_mapping_context, kernel):
+        super(FunctionScoper, self).__init__(rule_mapping_context)
+        self.kernel = kernel
+        self.scoped_functions = {}
+
+    def map_call(self, expr, expn_state):
+        from loopy.symbolic import ScopedFunction
+        if not isinstance(expr.function, ScopedFunction):
+
+            # search the kernel for the function
+            in_knl_callable = self.kernel.find_scoped_function_identifier(
+                    expr.function.name)
+            if in_knl_callable:
+                # associate the newly created ScopedFunction with the
+                # resolved in-kernel callable
+                self.scoped_functions[expr.function.name] = in_knl_callable
+
+                return type(expr)(
+                        ScopedFunction(expr.function.name),
+                        tuple(self.rec(child, expn_state)
+                            for child in expr.parameters))
+
+        # this is an unknown function as of yet, do not modify it
+        return super(FunctionScoper, self).map_call(expr, expn_state)
+
+    def map_call_with_kwargs(self, expr, expn_state):
+        # FIXME duplicated logic with map_call
+
+        from loopy.symbolic import ScopedFunction
+        if not isinstance(expr.function, ScopedFunction):
+
+            # search the kernel for the function.
+            in_knl_callable = self.kernel.find_scoped_function_identifier(
+                    expr.function.name)
+
+            if in_knl_callable:
+                # associate the newly created ScopedFunction with the
+                # resolved in-kernel callable
+                self.scoped_functions[expr.function.name] = in_knl_callable
+                return type(expr)(
+                        ScopedFunction(expr.function.name),
+                        tuple(self.rec(child, expn_state)
+                            for child in expr.parameters),
+                        dict(
+                            (key, self.rec(val, expn_state))
+                            for key, val in six.iteritems(expr.kw_parameters))
+                            )
+
+        # this is an unknown function as of yet, do not modify it
+        return super(FunctionScoper, self).map_call_with_kwargs(expr,
+                expn_state)
+
+    def map_reduction(self, expr, expn_state):
+        from loopy.library.reduction import (MaxReductionOperation,
+                MinReductionOperation, ArgMinReductionOperation,
+                ArgMaxReductionOperation, _SegmentedScalarReductionOperation,
+                SegmentedOp)
+        from loopy.library.reduction import ArgExtOp
+
+        # note down the extra functions arising due to certain reductions
+
+        # FIXME Discuss this. It cannot stay the way it is, because non-built-in
+        # reductions cannot add themselves to this list. We may need to change
+        # the reduction interface. Why don't reductions generate scoped functions
+        # in the first place?
+        if isinstance(expr.operation, MaxReductionOperation):
+            self.scoped_functions["max"] = (
+                    self.kernel.find_scoped_function_identifier("max"))
+        elif isinstance(expr.operation, MinReductionOperation):
+            self.scoped_functions["min"] = (
+                    self.kernel.find_scoped_function_identifier("min"))
+        elif isinstance(expr.operation, ArgMaxReductionOperation):
+            self.scoped_functions["max"] = (
+                    self.kernel.find_scoped_function_identifier("max"))
+            self.scoped_functions["make_tuple"] = (
+                    self.kernel.find_scoped_function_identifier("make_tuple"))
+            self.scoped_functions[ArgExtOp(expr.operation)] = (
+                    self.kernel.find_scoped_function_identifier(expr.operation))
+        elif isinstance(expr.operation, ArgMinReductionOperation):
+            self.scoped_functions["min"] = (
+                    self.kernel.find_scoped_function_identifier("min"))
+            self.scoped_functions["make_tuple"] = (
+                    self.kernel.find_scoped_function_identifier("make_tuple"))
+            self.scoped_functions[ArgExtOp(expr.operation)] = (
+                    self.kernel.find_scoped_function_identifier(expr.operation))
+        elif isinstance(expr.operation, _SegmentedScalarReductionOperation):
+            self.scoped_functions["make_tuple"] = (
+                    self.kernel.find_scoped_function_identifier("make_tuple"))
+            self.scoped_functions[SegmentedOp(expr.operation)] = (
+                    self.kernel.find_scoped_function_identifier(expr.operation))
+
+        return super(FunctionScoper, self).map_reduction(expr, expn_state)
+
+
+def scope_functions(kernel):
+    """
+    Returns a kernel with the pymbolic nodes involving known functions realized
+    as instances of :class:`loopy.symbolic.ScopedFunction`, along with the
+    resolved functions being added to the ``scoped_functions`` dictionary of
+    the kernel.
+    """
+
+    from loopy.symbolic import SubstitutionRuleMappingContext
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+
+    function_scoper = FunctionScoper(rule_mapping_context, kernel)
+
+    # scoping functions and collecting the scoped functions
+    kernel_with_scoped_functions = rule_mapping_context.finish_kernel(
+            function_scoper.map_kernel(kernel))
+
+    # merging the newly collected functions into the kernel's scoped functions
+    updated_scoped_functions = kernel.scoped_functions.copy()
+    updated_scoped_functions.update(function_scoper.scoped_functions)
+
+    return kernel_with_scoped_functions.copy(
+            scoped_functions=updated_scoped_functions)
+
+# }}}
+
+
 # {{{ kernel creation top-level
 
 def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
@@ -2174,6 +2320,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     check_for_duplicate_names(knl)
     check_written_variable_names(knl)
 
+    knl = scope_functions(knl)
+
     from loopy.preprocess import prepare_for_caching
     knl = prepare_for_caching(knl)
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 95c3c336c..1d79a86d7 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1877,7 +1877,15 @@ def infer_arg_is_output_only(kernel):
                 else:
                     new_args.append(arg.copy(is_output_only=False))
         elif isinstance(arg, ConstantArg):
+<<<<<<< HEAD
+            if arg.is_output_only:
+                raise LoopyError("Constant Argument %s cannot have "
+                        "is_output_only True" % arg.name)
+            else:
+                new_args.append(arg.copy(is_output_only=False))
+=======
             new_args.append(arg)
+>>>>>>> master
         else:
             raise NotImplementedError("Unkonwn argument type %s." % type(arg))
 
diff --git a/loopy/library/function.py b/loopy/library/function.py
index 9d557ac9f..4873eca91 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from loopy.kernel.function_interface import ScalarCallable
+
 
 def default_function_mangler(kernel, name, arg_dtypes):
     from loopy.library.reduction import reduction_function_mangler
@@ -56,4 +58,41 @@ def tuple_function_mangler(kernel, name, arg_dtypes):
     return None
 
 
+class MakeTupleCallable(ScalarCallable):
+    """``make_tuple`` callable: result ``-i-1`` mirrors argument ``i``."""
+    def with_types(self, arg_id_to_dtype, kernel):
+        new_arg_id_to_dtype = arg_id_to_dtype.copy()
+        for i in range(len(arg_id_to_dtype)):
+            if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None:
+                new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i]
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+            name_in_target="loopy_make_tuple")
+
+    def with_descrs(self, arg_id_to_descr):
+        # map every given id and its mirror -id-1 to its own value descriptor
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        new_arg_id_to_descr = dict((key, ValueArgDescriptor())
+            for id in arg_id_to_descr.keys() for key in (id, -id-1))
+        return self.copy(arg_id_to_descr=new_arg_id_to_descr)
+
+
+class IndexOfCallable(ScalarCallable):
+    def with_types(self, arg_id_to_dtype, kernel):
+        new_arg_id_to_dtype = arg_id_to_dtype.copy()
+        new_arg_id_to_dtype[-1] = kernel.index_dtype
+
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype)
+
+
+def loopy_specific_callable_scopers(target, identifier):
+    if identifier == "make_tuple":
+        return MakeTupleCallable(name="make_tuple")
+
+    if identifier in ["indexof", "indexof_vec"]:
+        return IndexOfCallable(name=identifier)
+
+    from loopy.library.reduction import reduction_scoper
+    return reduction_scoper(target, identifier)
+
+
 # vim: foldmethod=marker
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index b8633114d..a2880bfb8 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 
 from pytools import ImmutableRecord
 from mako.template import Template
+from loopy.kernel.function_interface import ScalarCallable
 import numpy as np
 
 
@@ -163,60 +164,73 @@ double${ width } ${ name }_f64(
 # }}}
 
 
-def random123_preamble_generator(preamble_info):
-    for f in preamble_info.seen_functions:
-        try:
-            rng_variant = FUNC_NAMES_TO_RNG[f.name]
-        except KeyError:
-            continue
+class Random123Callable(ScalarCallable):
+    """
+    Records information about the random123 functions.
+    """
+
+    def with_types(self, arg_id_to_dtype, kernel):
+
+        if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+        name = self.name
+        target = kernel.target
+
+        rng_variant = FUNC_NAMES_TO_RNG[name]
+
+        from loopy.types import NumpyType
+        base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits]
+        ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width)
+        key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width)
+
+        fn = rng_variant.full_name
+        if name == fn:
+            new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=fn+"_gen")
+
+        elif name == fn + "_f32":
+            new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32),
+                rng_variant.width),
+                    -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=name)
+
+        elif name == fn + "_f64":
+            new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64),
+                rng_variant.width),
+                    -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=name)
+
+        return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+    def generate_preambles(self, target):
+        rng_variant = FUNC_NAMES_TO_RNG[self.name]
 
         from loopy.target.pyopencl import PyOpenCLTarget
         yield ("90-random123-"+rng_variant.full_name,
                 PREAMBLE_TEMPLATE.render(
                     is_pyopencl_target=isinstance(
-                        preamble_info.kernel.target,
+                        target,
                         PyOpenCLTarget),
                     rng_variant=rng_variant,
                     ))
 
+        return
 
-def random123_function_mangler(kernel, name, arg_dtypes):
-    try:
-        rng_variant = FUNC_NAMES_TO_RNG[name]
-    except KeyError:
-        return None
-
-    from loopy.types import NumpyType
-    target = kernel.target
-    base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits]
-    ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width)
-    key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width)
-
-    from loopy.kernel.data import CallMangleInfo
-    fn = rng_variant.full_name
-    if name == fn:
-        return CallMangleInfo(
-                target_name=fn+"_gen",
-                result_dtypes=(ctr_dtype, ctr_dtype),
-                arg_dtypes=(ctr_dtype, key_dtype))
-
-    elif name == fn + "_f32":
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(
-                    target.vector_dtype(NumpyType(np.float32), rng_variant.width),
-                    ctr_dtype),
-                arg_dtypes=(ctr_dtype, key_dtype))
-
-    elif name == fn + "_f64":
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(
-                    target.vector_dtype(NumpyType(np.float64), rng_variant.width),
-                    ctr_dtype),
-                arg_dtypes=(ctr_dtype, key_dtype))
-
-    else:
-        return None
+
+def random123_function_scoper(target, identifier):
+    if identifier in FUNC_NAMES_TO_RNG:
+        return Random123Callable(name=identifier)
+
+    return None
 
 # vim: foldmethod=marker
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 8ed5cbe56..ca2f02347 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -1,4 +1,4 @@
-from __future__ import division
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -24,6 +24,8 @@ THE SOFTWARE.
 
 
 from pymbolic import var
+from loopy.symbolic import ScopedFunction
+from loopy.kernel.function_interface import ScalarCallable
 import numpy as np
 
 from loopy.symbolic import FunctionIdentifier
@@ -180,7 +182,7 @@ class MaxReductionOperation(ScalarReductionOperation):
         return get_ge_neutral(dtype)
 
     def __call__(self, dtype, operand1, operand2):
-        return var("max")(operand1, operand2)
+        return ScopedFunction("max")(operand1, operand2)
 
 
 class MinReductionOperation(ScalarReductionOperation):
@@ -188,7 +190,7 @@ class MinReductionOperation(ScalarReductionOperation):
         return get_le_neutral(dtype)
 
     def __call__(self, dtype, operand1, operand2):
-        return var("min")(operand1, operand2)
+        return ScopedFunction("min")(operand1, operand2)
 
 
 # {{{ base class for symbolic reduction ops
@@ -237,7 +239,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
 
     def neutral_element(self, scalar_dtype, segment_flag_dtype):
         scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype)
-        return var("make_tuple")(scalar_neutral_element,
+        return ScopedFunction("make_tuple")(scalar_neutral_element,
                 segment_flag_dtype.numpy_dtype.type(0))
 
     def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype):
@@ -254,7 +256,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
         return type(self) == type(other)
 
     def __call__(self, dtypes, operand1, operand2):
-        return SegmentedOp(self)(*(operand1 + operand2))
+        return ScopedFunction(SegmentedOp(self))(*(operand1 + operand2))
 
 
 class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
@@ -268,29 +270,6 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
     op = "((%s) * (%s))"
     which = "product"
 
-
-def get_segmented_function_preamble(kernel, func_id, arg_dtypes):
-    op = func_id.reduction_op
-    scalar_dtype = arg_dtypes[0]
-    segment_flag_dtype = arg_dtypes[1]
-    prefix = op.prefix(scalar_dtype, segment_flag_dtype)
-
-    return (prefix, """
-    inline %(scalar_t)s %(prefix)s_op(
-        %(scalar_t)s op1, %(segment_flag_t)s segment_flag1,
-        %(scalar_t)s op2, %(segment_flag_t)s segment_flag2,
-        %(segment_flag_t)s *segment_flag_out)
-    {
-        *segment_flag_out = segment_flag1 | segment_flag2;
-        return segment_flag2 ? op2 : %(combined)s;
-    }
-    """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
-            prefix=prefix,
-            segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype),
-            combined=op.op % ("op1", "op2"),
-            ))
-
 # }}}
 
 
@@ -313,7 +292,7 @@ class _ArgExtremumReductionOperation(ReductionOperation):
         scalar_neutral_func = (
                 get_ge_neutral if self.neutral_sign < 0 else get_le_neutral)
         scalar_neutral_element = scalar_neutral_func(scalar_dtype)
-        return var("make_tuple")(scalar_neutral_element,
+        return ScopedFunction("make_tuple")(scalar_neutral_element,
                 index_dtype.numpy_dtype.type(-1))
 
     def __str__(self):
@@ -330,7 +309,7 @@ class _ArgExtremumReductionOperation(ReductionOperation):
         return 2
 
     def __call__(self, dtypes, operand1, operand2):
-        return ArgExtOp(self)(*(operand1 + operand2))
+        return ScopedFunction(ArgExtOp(self))(*(operand1 + operand2))
 
 
 class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
@@ -344,38 +323,6 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation):
     update_comparison = "<="
     neutral_sign = +1
 
-
-def get_argext_preamble(kernel, func_id, arg_dtypes):
-    op = func_id.reduction_op
-    scalar_dtype = arg_dtypes[0]
-    index_dtype = arg_dtypes[1]
-
-    prefix = op.prefix(scalar_dtype, index_dtype)
-
-    return (prefix, """
-    inline %(scalar_t)s %(prefix)s_op(
-        %(scalar_t)s op1, %(index_t)s index1,
-        %(scalar_t)s op2, %(index_t)s index2,
-        %(index_t)s *index_out)
-    {
-        if (op2 %(comp)s op1)
-        {
-            *index_out = index2;
-            return op2;
-        }
-        else
-        {
-            *index_out = index1;
-            return op1;
-        }
-    }
-    """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
-            prefix=prefix,
-            index_t=kernel.target.dtype_to_typename(index_dtype),
-            comp=op.update_comparison,
-            ))
-
 # }}}
 
 
@@ -429,70 +376,91 @@ def parse_reduction_op(name):
 # }}}
 
 
-def reduction_function_mangler(kernel, func_id, arg_dtypes):
-    if isinstance(func_id, ArgExtOp):
-        from loopy.target.opencl import CTarget
-        if not isinstance(kernel.target, CTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-        scalar_dtype = arg_dtypes[0]
-        index_dtype = arg_dtypes[1]
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_op" % op.prefix(
-                    scalar_dtype, index_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, scalar_dtype, index_dtype),
-                arg_dtypes=(
-                    scalar_dtype,
-                    index_dtype,
-                    scalar_dtype,
-                    index_dtype),
-                )
-
-    elif isinstance(func_id, SegmentedOp):
-        from loopy.target.opencl import CTarget
-        if not isinstance(kernel.target, CTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-        scalar_dtype = arg_dtypes[0]
-        segment_flag_dtype = arg_dtypes[1]
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_op" % op.prefix(
-                    scalar_dtype, segment_flag_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, scalar_dtype, segment_flag_dtype),
-                arg_dtypes=(
-                    scalar_dtype,
-                    segment_flag_dtype,
-                    scalar_dtype,
-                    segment_flag_dtype),
-                )
+# {{{ reduction specific callables
+
+class ReductionCallable(ScalarCallable):
+    def with_types(self, arg_id_to_dtype, kernel):
+        scalar_dtype = arg_id_to_dtype[0]
+        index_dtype = arg_id_to_dtype[1]
+        result_dtypes = self.name.result_dtypes(kernel, scalar_dtype,
+                index_dtype)
+        new_arg_id_to_dtype = arg_id_to_dtype.copy()
+        new_arg_id_to_dtype[-1] = result_dtypes[0]
+        new_arg_id_to_dtype[-2] = result_dtypes[1]
+        name_in_target = self.name.prefix(scalar_dtype, index_dtype) + "_op"
+
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                name_in_target=name_in_target)
+
+    def with_descr(self, arg_id_to_descr):
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        new_arg_id_to_descr = arg_id_to_descr.copy()  # leave the input untouched
+        new_arg_id_to_descr[-1] = ValueArgDescriptor()  # id -1: first result
+        return self.copy(arg_id_to_descr=new_arg_id_to_descr)
+
+    def generate_preambles(self, target):
+        if isinstance(self.name, _ArgExtremumReductionOperation):
+            op = self.name
+            scalar_dtype = self.arg_id_to_dtype[-1]
+            index_dtype = self.arg_id_to_dtype[-2]
+
+            prefix = op.prefix(scalar_dtype, index_dtype)
+
+            yield (prefix, """
+            inline %(scalar_t)s %(prefix)s_op(
+                %(scalar_t)s op1, %(index_t)s index1,
+                %(scalar_t)s op2, %(index_t)s index2,
+                %(index_t)s *index_out)
+            {
+                if (op2 %(comp)s op1)
+                {
+                    *index_out = index2;
+                    return op2;
+                }
+                else
+                {
+                    *index_out = index1;
+                    return op1;
+                }
+            }
+            """ % dict(
+                    scalar_t=target.dtype_to_typename(scalar_dtype),
+                    prefix=prefix,
+                    index_t=target.dtype_to_typename(index_dtype),
+                    comp=op.update_comparison,
+                    ))
+        elif isinstance(self.name, _SegmentedScalarReductionOperation):
+            op = self.name
+            scalar_dtype = self.arg_id_to_dtype[-1]
+            segment_flag_dtype = self.arg_id_to_dtype[-2]
+            prefix = op.prefix(scalar_dtype, segment_flag_dtype)
+
+            yield (prefix, """
+            inline %(scalar_t)s %(prefix)s_op(
+                %(scalar_t)s op1, %(segment_flag_t)s segment_flag1,
+                %(scalar_t)s op2, %(segment_flag_t)s segment_flag2,
+                %(segment_flag_t)s *segment_flag_out)
+            {
+                *segment_flag_out = segment_flag1 | segment_flag2;
+                return segment_flag2 ? op2 : %(combined)s;
+            }
+            """ % dict(
+                    scalar_t=target.dtype_to_typename(scalar_dtype),
+                    prefix=prefix,
+                    segment_flag_t=target.dtype_to_typename(segment_flag_dtype),
+                    combined=op.op % ("op1", "op2"),
+                    ))
+
+        return
+
+
+def reduction_scoper(target, identifier):
+    if isinstance(identifier, (_ArgExtremumReductionOperation,
+            _SegmentedScalarReductionOperation)):
+        return ReductionCallable(name=identifier)
 
     return None
 
-
-def reduction_preamble_generator(preamble_info):
-    from loopy.target.opencl import OpenCLTarget
-
-    for func in preamble_info.seen_functions:
-        if isinstance(func.name, ArgExtOp):
-            if not isinstance(preamble_info.kernel.target, OpenCLTarget):
-                raise LoopyError("only OpenCL supported for now")
-
-            yield get_argext_preamble(preamble_info.kernel, func.name,
-                    func.arg_dtypes)
-
-        elif isinstance(func.name, SegmentedOp):
-            if not isinstance(preamble_info.kernel.target, OpenCLTarget):
-                raise LoopyError("only OpenCL supported for now")
-
-            yield get_segmented_function_preamble(preamble_info.kernel, func.name,
-                    func.arg_dtypes)
+# }}}
 
 # vim: fdm=marker
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index fc950c78e..6beadb3de 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -27,6 +27,7 @@ import six
 from loopy.diagnostic import (
         LoopyError, WriteRaceConditionWarning, warn_with_kernel,
         LoopyAdvisory)
+from functools import reduce
 
 import islpy as isl
 
@@ -37,6 +38,10 @@ from loopy.version import DATA_MODEL_VERSION
 from loopy.kernel.data import make_assignment, filter_iname_tags_by_type
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
+from loopy.symbolic import CombineMapper
+
+from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction,
+        CallInstruction,  _DataObliviousInstruction)
 
 import logging
 logger = logging.getLogger(__name__)
@@ -2108,6 +2113,350 @@ def check_atomic_loads(kernel):
 # }}}
 
 
+# {{{ arg_descr_inference
+
+class ArgDescrInferenceMapper(CombineMapper):
+    """
+    Returns a set of instances of :class:`tuple` (expr,
+    in_kernel_callable). The mapped `in_kernel_callable` of the
+    :class:`InKernelCallable` are descriptor specialized for the given
+    arguments.
+    """
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+
+    def combine(self, values):
+        import operator
+        return reduce(operator.or_, values, frozenset())
+
+    # FIXME logic duplication between map_call and map_call_with_kwargs
+    def map_call(self, expr, **kwargs):
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        from loopy.symbolic import SubArrayRef, ScopedFunction
+
+        # ignoring if the call is not to a ScopedFunction
+        if not isinstance(expr.function, ScopedFunction):
+            return self.combine((self.rec(child) for child in expr.parameters))
+
+        # descriptors for the args
+        arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel))
+                if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor())
+                for i, par in enumerate(expr.parameters))
+
+        assignee_id_to_descr = {}
+
+        # assignee descriptor
+        if 'assignees' in kwargs:
+            # If supplied with assignees then this is a CallInstruction
+            assignees = kwargs['assignees']
+            assert isinstance(assignees, tuple)
+            for i, par in enumerate(assignees):
+                if isinstance(par, SubArrayRef):
+                    assignee_id_to_descr[-i-1] = (
+                            par.get_array_arg_descriptor(self.kernel))
+                else:
+                    assignee_id_to_descr[-i-1] = ValueArgDescriptor()
+
+        # gathering all the descriptors
+        # TODO: I don't like in-place updates. Change this to something else.
+        # Perhaps make a function?
+        combined_arg_id_to_descr = arg_id_to_descr.copy()
+        combined_arg_id_to_descr.update(assignee_id_to_descr)
+
+        # specializing the function according to the parameter description
+        new_scoped_function = (
+                self.kernel.scoped_functions[expr.function.name].with_descrs(
+                    combined_arg_id_to_descr))
+
+        # collecting the descriptors for args, kwargs, assignees
+        return (frozenset(((expr, new_scoped_function), )) |
+                self.combine((self.rec(child) for child in expr.parameters)))
+
+    def map_call_with_kwargs(self, expr, **kwargs):
+        """Like :meth:`map_call`, but also covers the keyword arguments."""
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        from loopy.symbolic import SubArrayRef, ScopedFunction
+
+        all_pars = expr.parameters + tuple(expr.kw_parameters.values())
+        # ignoring if the call is not to a ScopedFunction
+        if not isinstance(expr.function, ScopedFunction):
+            return self.combine((self.rec(child) for child in all_pars))
+
+        # descriptors for the args and kwargs, one (id, descr) pair per argument
+        arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel))
+                if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor())
+                for i, par in tuple(enumerate(expr.parameters)) +
+                tuple(expr.kw_parameters.items()))
+
+        assignee_id_to_descr = {}
+        if 'assignees' in kwargs:
+            # If supplied with assignees then this is a CallInstruction
+            assignees = kwargs['assignees']
+            assert isinstance(assignees, tuple)
+            for i, par in enumerate(assignees):
+                assignee_id_to_descr[-i-1] = (
+                        par.get_array_arg_descriptor(self.kernel)
+                        if isinstance(par, SubArrayRef) else ValueArgDescriptor())
+
+        # gathering all the descriptors (TODO: avoid the in-place update)
+        combined_arg_id_to_descr = arg_id_to_descr.copy()
+        combined_arg_id_to_descr.update(assignee_id_to_descr)
+
+        # specializing the function according to the parameter description
+        new_scoped_function = (
+                self.kernel.scoped_functions[expr.function.name].with_descrs(
+                    combined_arg_id_to_descr))
+
+        # collecting the descriptors for args, kwargs, assignees
+        return (frozenset(((expr, new_scoped_function), )) |
+                self.combine((self.rec(child) for child in all_pars)))
+
+    def map_constant(self, expr, **kwargs):
+        return frozenset()
+
+    map_variable = map_constant
+    map_function_symbol = map_constant
+    map_tagged_variable = map_constant
+    map_type_cast = map_constant
+
+
+def infer_arg_descr(kernel):
+    """
+    Returns a copy of *kernel* with the argument shapes and strides matching for
+    scoped functions in the *kernel*. Refer
+    :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`.
+    """
+
+    arg_description_modifier = ArgDescrInferenceMapper(kernel)
+    pymbolic_calls_to_functions = set()
+
+    for insn in kernel.instructions:
+
+        if isinstance(insn, CallInstruction):
+            # In call instructions the assignees play an important role in
+            # determining the arg_id_to_descr
+            pymbolic_calls_to_functions.update(
+                    arg_description_modifier(insn.expression,
+                        assignees=insn.assignees))
+        elif isinstance(insn, MultiAssignmentBase):
+            pymbolic_calls_to_functions.update(arg_description_modifier(
+                insn.expression))
+        elif isinstance(insn, (_DataObliviousInstruction, CInstruction)):
+            pass
+        else:
+            raise NotImplementedError("arg_descr_inference for %s instruction" %
+                    type(insn))
+
+    # converting the set of (expr, callable) tuples into a dict
+    pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions)
+
+    # Now do the similar treatment as done for type inference.
+    from loopy.kernel.function_interface import (
+            register_pymbolic_calls_to_knl_callables)
+
+    return register_pymbolic_calls_to_knl_callables(kernel,
+            pymbolic_calls_to_functions)
+
+# }}}
+
+
+# {{{ hw_axes_inference
+
+class HWAxesInferenceMapper(CombineMapper):
+    """
+    Returns a set of instances of :class:`tuple` (expr,
+    in_kernel_callable). The mapped `in_kernel_callable` of the
+    :class:`InKernelCallable` are specialized for the grid sizes of
+    :attr:`kernel`.
+    """
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        self.local_size, self.global_size = kernel.get_grid_size_upper_bounds()
+
+    def combine(self, values):
+        import operator
+        return reduce(operator.or_, values, frozenset())
+
+    def map_call(self, expr, **kwargs):
+        # ignoring if the call is not to a ScopedFunction
+        from loopy.symbolic import ScopedFunction
+        if not isinstance(expr.function, ScopedFunction):
+            return self.combine((self.rec(child) for child in expr.parameters))
+
+        new_scoped_function = (
+                self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes(
+                    self.local_size, self.global_size))
+
+        return (frozenset(((expr, new_scoped_function), )) |
+                self.combine((self.rec(child) for child in expr.parameters)))
+
+    def map_call_with_kwargs(self, expr, **kwargs):
+        from loopy.symbolic import ScopedFunction
+        # ignoring if the call is not to a ScopedFunction
+        if not isinstance(expr.function, ScopedFunction):
+            return self.combine((self.rec(child) for child in expr.parameters))
+
+        new_scoped_function = (
+                self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes(
+                    self.local_size, self.global_size))
+
+        return (frozenset(((expr, new_scoped_function), )) |
+                self.combine((self.rec(child) for child in
+                    expr.parameters+tuple(expr.kw_parameters.values()))))
+
+    def map_constant(self, expr, **kwargs):
+        return frozenset()
+
+    map_variable = map_constant
+    map_function_symbol = map_constant
+    map_tagged_variable = map_constant
+    map_type_cast = map_constant
+
+
+def infer_hw_axes_sizes(kernel):
+    """
+    Returns a copy of *kernel* with the hardware axes matching for
+    scoped functions in the *kernel*. Refer
+    :meth:`loopy.kernel.function_interface.InKernelCallable.with_hw_axes_sizes`.
+    """
+    hw_axes_modifier = HWAxesInferenceMapper(kernel)
+    pymbolic_calls_to_functions = set()
+
+    for insn in kernel.instructions:
+        if isinstance(insn, MultiAssignmentBase):
+            pymbolic_calls_to_functions.update(hw_axes_modifier(
+                insn.expression))
+        elif isinstance(insn, (_DataObliviousInstruction, CInstruction)):
+            pass
+        else:
+            raise NotImplementedError("unknown type of instruction %s." %
+                    type(insn))
+
+    # converting the set of (expr, callable) tuples into a dict
+    pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions)
+
+    # Now do the similar treatment as done for type inference.
+    from loopy.kernel.function_interface import (
+            register_pymbolic_calls_to_knl_callables)
+
+    return register_pymbolic_calls_to_knl_callables(kernel,
+            pymbolic_calls_to_functions)
+
+# }}}
+
+
+# {{{ catching functions that are not ready for codegen
+
+class FunctionsNotReadyForCodegenCollector(CombineMapper):
+    """
+    Returns *True* if every function call in an expression is ready for
+    code generation, and *False* otherwise.
+    """
+    def __init__(self, kernel):
+        self.kernel = kernel
+
+    def combine(self, values):
+        return all(values)
+
+    # FIXME logic duplication between map_call and map_call_with_kwargs
+    def map_call(self, expr, *args, **kwargs):
+        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        from pymbolic.primitives import Variable
+        from loopy.symbolic import ScopedFunction
+
+        if isinstance(expr.function, (ArgExtOp, SegmentedOp)):
+            return self.combine(
+                    tuple(
+                        self.rec(child, *args, **kwargs) for child in
+                        expr.parameters))
+        elif isinstance(expr.function, Variable):
+            # UnScopedFunction obtained and hence clearly not ready for
+            # codegen.
+            return False
+
+        elif isinstance(expr.function, ScopedFunction):
+            is_ready_for_codegen = self.kernel.scoped_functions[
+                    expr.function.name].is_ready_for_codegen()
+            return self.combine(
+                    (is_ready_for_codegen,) +
+                    tuple(
+                        self.rec(child, *args, **kwargs)
+                        for child in expr.parameters))
+        else:
+            raise LoopyError("Unexpected function type %s obtained in %s"
+                    % (type(expr.function), expr))
+
+    def map_call_with_kwargs(self, expr, *args, **kwargs):
+        is_ready_for_codegen = self.kernel.scoped_functions[
+                expr.function.name].is_ready_for_codegen()
+        return self.combine(
+                (is_ready_for_codegen,)
+                + tuple(
+                    self.rec(child, *args, **kwargs)
+                    for child in expr.parameters)
+                + tuple(
+                    self.rec(child, *args, **kwargs)
+                    for child in expr.kw_parameters.values())
+                )
+
+    def map_constant(self, expr, *args, **kwargs):
+        return True  # a leaf holds no call, so nothing is left to specialize
+
+    map_variable = map_constant
+    map_function_symbol = map_constant
+    map_tagged_variable = map_constant
+    map_type_cast = map_constant
+
+
+def make_functions_ready_for_codegen(kernel):
+    """
+    Specializes the functions in the kernel that are missed during type
+    inference.
+
+    .. code:: python
+
+        knl = lp.make_kernel(
+            "{[i]: 0<=i<16}",
+            "a[i] = sin(b[i])",
+            [lp.ArrayArg('a', dtype=np.float64),
+            lp.ArrayArg('b', dtype=np.float64)])
+
+    In the above case, none of the instructions undergo type-specialization, as
+    all the arguments' types have been realized. But, this would be a problem
+    during the code generation phase as ``sin`` did not undergo type
+    specialization, and hence must be fixed through this function.
+    """
+    from loopy.type_inference import TypeInferenceMapper
+    from loopy.symbolic import SubstitutionRuleExpander
+    from loopy.kernel.function_interface import (
+            register_pymbolic_calls_to_knl_callables)
+
+    unready_functions_collector = FunctionsNotReadyForCodegenCollector(kernel)
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
+    type_inf_mapper = TypeInferenceMapper(kernel)
+
+    for insn in kernel.instructions:
+        if isinstance(insn, MultiAssignmentBase):
+            expr = subst_expander(insn.expression)
+            if not unready_functions_collector(expr):
+                # Infer the type of the functions that are not type specialized.
+                type_inf_mapper(expr, return_tuple=isinstance(insn,
+                    CallInstruction), return_dtype_set=True)
+
+        elif isinstance(insn, (_DataObliviousInstruction, CInstruction)):
+            pass
+
+        else:
+            raise NotImplementedError("Unknown Instruction")
+
+    return register_pymbolic_calls_to_knl_callables(kernel,
+            type_inf_mapper.specialized_functions)
+
+# }}}
+
+
 preprocess_cache = WriteOncePersistentDict(
         "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
@@ -2188,6 +2537,16 @@ def preprocess_kernel(kernel, device=None):
 
     kernel = find_temporary_address_space(kernel)
 
+    # inferring the shape and dim_tags of the arguments involved in a function
+    # call.
+    kernel = infer_arg_descr(kernel)
+
+    # type specialize functions that were missed during the type inference.
+    kernel = make_functions_ready_for_codegen(kernel)
+
+    # tuning the functions in the kernel to align with the grid sizes.
+    kernel = infer_hw_axes_sizes(kernel)
+
     # boostability should be removed in 2017.x.
     kernel = find_idempotence(kernel)
     kernel = limit_boostability(kernel)
diff --git a/loopy/statistics.py b/loopy/statistics.py
index cee28b24f..6c012ca21 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -712,9 +712,16 @@ class ExpressionOpCounter(CounterBase):
     map_variable = map_constant
 
     def map_call(self, expr):
+        from loopy.symbolic import ScopedFunction
+        if isinstance(expr.function, ScopedFunction):
+            function_identifier = self.knl.scoped_functions[
+                    expr.function.name].name
+        else:
+            function_identifier = expr.function.name
+
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
-                        name='func:'+str(expr.function),
+                        name='func:'+str(function_identifier),
                         count_granularity=CountGranularity.WORKITEM): 1}
                     ) + self.rec(expr.parameters)
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 8927cd6fb..770e1128a 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -56,6 +56,7 @@ from pymbolic.mapper.constant_folder import \
         ConstantFoldingMapper as ConstantFoldingMapperBase
 
 from pymbolic.parser import Parser as ParserBase
+from loopy.diagnostic import LoopyError
 
 from loopy.diagnostic import ExpressionToAffineConversionError
 
@@ -106,7 +107,10 @@ class IdentityMapperMixin(object):
         return expr
 
     def map_type_annotation(self, expr, *args):
-        return type(expr)(expr.type, self.rec(expr.child))
+        return type(expr)(expr.type, self.rec(expr.child, *args))
+
+    def map_scoped_function(self, expr, *args):
+        return ScopedFunction(self.rec(expr.function, *args))
 
     map_type_cast = map_type_annotation
 
@@ -165,9 +169,16 @@ class WalkMapper(WalkMapperBase):
 
     map_rule_argument = map_group_hw_index
 
+    def map_scoped_function(self, expr, *args):
+        if not self.visit(expr):
+            return
+
+        self.rec(expr.function, *args)
+
 
 class CallbackMapper(CallbackMapperBase, IdentityMapper):
     map_reduction = CallbackMapperBase.map_constant
+    map_scoped_function = CallbackMapperBase.map_constant
 
 
 class CombineMapper(CombineMapperBase):
@@ -232,13 +243,16 @@ class StringifyMapper(StringifyMapperBase):
         from pymbolic.mapper.stringifier import PREC_NONE
         return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE))
 
+    def map_scoped_function(self, expr, prec):
+        return "ScopedFunction('%s')" % expr.name
+
 
 class UnidirectionalUnifier(UnidirectionalUnifierBase):
     def map_reduction(self, expr, other, unis):
         if not isinstance(other, type(expr)):
             return self.treat_mismatch(expr, other, unis)
         if (expr.inames != other.inames
-                or type(expr.operation) != type(other.operation)  # noqa
+                or type(expr.operation) != type(other.operation)  # noqa
                 ):
             return []
 
@@ -274,6 +288,13 @@ class DependencyMapper(DependencyMapperBase):
         return self.combine(
                 self.rec(child, *args) for child in expr.parameters)
 
+    def map_call_with_kwargs(self, expr, *args):
+        # Loopy does not have first-class functions. Do not descend
+        # into 'function' attribute of Call.
+        return self.combine(
+                self.rec(child, *args) for child in expr.parameters+tuple(
+                    expr.kw_parameters.values()))
+
     def map_reduction(self, expr):
         deps = self.rec(expr.expr)
         return deps - set(p.Variable(iname) for iname in expr.inames)
@@ -289,6 +310,9 @@ class DependencyMapper(DependencyMapperBase):
     def map_type_cast(self, expr):
         return self.rec(expr.child)
 
+    def map_scoped_function(self, expr):
+        return self.rec(expr.function)
+
 
 class SubstitutionRuleExpander(IdentityMapper):
     def __init__(self, rules):
@@ -638,6 +662,51 @@ class RuleArgument(p.Expression):
 
     mapper_method = intern("map_rule_argument")
 
+
+class ScopedFunction(p.Expression):
+    """
+    A function invocation whose definition is known in a :mod:`loopy` kernel.
+    Each instance of :class:`loopy.symbolic.ScopedFunction` in an expression
+    points to an instance of
+    :class:`loopy.kernel.function_interface.InKernelCallable` through the
+    mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer
+    :ref:`ref_scoped_function` for a slightly detailed explanation on scoped
+    functions.
+
+    .. attribute:: function
+
+        An instance of :class:`pymbolic.primitives.Variable`,
+        :class:`loopy.library.reduction.ArgExtOp` or
+        :class:`loopy.library.reduction.SegmentedOp`.
+    """
+    init_arg_names = ("function", )
+
+    def __init__(self, function):
+        if isinstance(function, str):
+            function = p.Variable(function)
+        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp))
+        self.function = function
+
+    @property
+    def name(self):
+        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        if isinstance(self.function, p.Variable):
+            return self.function.name
+        elif isinstance(self.function, (ArgExtOp, SegmentedOp)):
+            return self.function
+        else:
+            raise LoopyError("Unexpected function type %s in ScopedFunction." %
+                    type(self.function))
+
+    def __getinitargs__(self):
+        return (self.function, )
+
+    def stringifier(self):
+        return StringifyMapper
+
+    mapper_method = intern("map_scoped_function")
+
 # }}}
 
 
@@ -650,9 +719,12 @@ def get_dependencies(expr):
 # {{{ rule-aware mappers
 
 def parse_tagged_name(expr):
+    from loopy.library.reduction import ArgExtOp, SegmentedOp
     if isinstance(expr, TaggedVariable):
         return expr.name, expr.tag
-    elif isinstance(expr, p.Variable):
+    elif isinstance(expr, ScopedFunction):
+        return parse_tagged_name(expr.function)
+    elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)):
         return expr.name, None
     else:
         raise RuntimeError("subst rule name not understood: %s" % expr)
@@ -1100,6 +1172,14 @@ class FunctionToPrimitiveMapper(IdentityMapper):
             else:
                 return IdentityMapper.map_call(self, expr)
 
+    def map_call_with_kwargs(self, expr):
+        for par in expr.kw_parameters.values():
+            if not isinstance(par, SubArrayRef):
+                raise LoopyError("Keyword Arguments is only supported for"
+                        " array arguments--use positional order to specify"
+                        " the order of the arguments in the call.")
+        return IdentityMapper.map_call_with_kwargs(self, expr)
+
 
 # {{{ customization to pymbolic parser
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index a81354e2f..9733fa446 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -150,7 +150,12 @@ class ASTBuilderBase(object):
 
     # {{{ library
 
-    def function_manglers(self):
+    def function_scopers(self):
+        """
+        Returns a list of functions of signature ``(target, identifier)``,
+        each of which returns an instance of :class:`InKernelCallable` if
+        the identifier is known to it, or *None* otherwise.
+        """
         return []
 
     def symbol_manglers(self):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 83efecf0e..eab1e6afc 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -27,7 +27,6 @@ THE SOFTWARE.
 import six
 
 import numpy as np  # noqa
-from loopy.kernel.data import CallMangleInfo
 from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from cgen import Pointer, NestedDeclarator, Block
@@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
 from pymbolic.mapper.stringifier import PREC_NONE
 from loopy.symbolic import IdentityMapper
 from loopy.types import NumpyType
+from loopy.kernel.function_interface import ScalarCallable
 import pymbolic.primitives as p
 
 from pytools import memoize_method
@@ -354,71 +354,105 @@ def c_symbol_mangler(kernel, name):
 # }}}
 
 
-# {{{ function mangler
+# {{{ function scoping
 
-def c_math_mangler(target, name, arg_dtypes, modify_name=True):
-    # Function mangler for math functions defined in C standard
-    # Convert abs, min, max to fabs, fmin, fmax.
-    # If modify_name is set to True, function names are modified according to
-    # floating point types of the arguments (e.g. cos(double), cosf(float))
-    # This should be set to True for C and Cuda, False for OpenCL
-    if not isinstance(name, str):
-        return None
+class CMathCallable(ScalarCallable):
+    """
+    An umbrella callable for all the math functions which can be seen in a
+    C-Target.
+    """
 
-    if name in ["abs", "min", "max"]:
-        name = "f" + name
+    def with_types(self, arg_id_to_dtype, kernel):
+        name = self.name
 
-    # unitary functions
-    if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh",
-                 "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]
-            and len(arg_dtypes) == 1
-            and arg_dtypes[0].numpy_dtype.kind == "f"):
+        if name in ["abs", "min", "max"]:
+            name = "f" + name
 
-        dtype = arg_dtypes[0].numpy_dtype
+        # unary functions
+        if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh",
+                    "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]:
 
-        if modify_name:
-            if dtype == np.float64:
-                pass  # fabs
-            elif dtype == np.float32:
-                name = name + "f"  # fabsf
-            elif dtype == np.float128:
-                name = name + "l"  # fabsl
-            else:
-                raise LoopyTypeError("%s does not support type %s" % (name, dtype))
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 0:
+                    raise LoopyError("%s can take only one argument." % name)
 
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=arg_dtypes,
-                arg_dtypes=arg_dtypes)
+            if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return self.copy(arg_id_to_dtype=arg_id_to_dtype)
 
-    # binary functions
-    if (name in ["fmax", "fmin"]
-            and len(arg_dtypes) == 2):
+            dtype = arg_id_to_dtype[0]
+            dtype = dtype.numpy_dtype
 
-        dtype = np.find_common_type(
-            [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
-        if dtype.kind == "c":
-            raise LoopyTypeError("%s does not support complex numbers")
+            if dtype.kind in ('u', 'i'):
+                # signed and unsigned integers are cast to float32
+                dtype = np.float32
+            elif dtype.kind == 'c':
+                raise LoopyTypeError("%s does not support type %s" % (name, dtype))
 
-        elif dtype.kind == "f":
-            if modify_name:
+            from loopy.target.opencl import OpenCLTarget
+            if not isinstance(kernel.target, OpenCLTarget):
+                # for CUDA, C Targets the name must be modified
                 if dtype == np.float64:
-                    pass  # fmin
+                    pass  # fabs
                 elif dtype == np.float32:
-                    name = name + "f"  # fminf
+                    name = name + "f"  # fabsf
                 elif dtype == np.float128:
-                    name = name + "l"  # fminl
+                    name = name + "l"  # fabsl
                 else:
-                    raise LoopyTypeError("%s does not support type %s"
-                                         % (name, dtype))
+                    raise LoopyTypeError("%s does not support type %s" % (name,
+                        dtype))
+
+            return self.copy(name_in_target=name,
+                    arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)})
+
+        # binary functions
+        if name in ["fmax", "fmin"]:
+
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only two arguments." % name)
+
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+            dtype = np.find_common_type(
+                [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
+                     if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyTypeError("%s does not support complex numbers" % name)
+
+            elif dtype.kind == "f":
+                from loopy.target.opencl import OpenCLTarget
+                if not isinstance(kernel.target, OpenCLTarget):
+                    if dtype == np.float64:
+                        pass  # fmin
+                    elif dtype == np.float32:
+                        name = name + "f"  # fminf
+                    elif dtype == np.float128:
+                        name = name + "l"  # fminl
+                    else:
+                        raise LoopyTypeError("%s does not support type %s"
+                                             % (name, dtype))
+            dtype = NumpyType(dtype)
+            return self.copy(name_in_target=name,
+                    arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype})
+
+        return self.copy(arg_id_to_dtype=arg_id_to_dtype)
 
-            result_dtype = NumpyType(dtype)
-            return CallMangleInfo(
-                    target_name=name,
-                    result_dtypes=(result_dtype,),
-                    arg_dtypes=2*(result_dtype,))
 
+def scope_c_math_functions(target, identifier):
+    """
+    Returns an instance of :class:`InKernelCallable` if the function
+    represented by *identifier* is known in C, otherwise returns *None*.
+    """
+    if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh",
+            "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]:
+        return CMathCallable(name=identifier)
     return None
 
 # }}}
@@ -427,12 +461,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
 class CASTBuilder(ASTBuilderBase):
     # {{{ library
 
-    def function_manglers(self):
-        return (
-                super(CASTBuilder, self).function_manglers() + [
-                    c_math_mangler
-                    ])
-
     def symbol_manglers(self):
         return (
                 super(CASTBuilder, self).symbol_manglers() + [
@@ -445,6 +473,11 @@ class CASTBuilder(ASTBuilderBase):
                     _preamble_generator,
                     ])
 
+    def function_scopers(self):
+        return (
+                super(CASTBuilder, self).function_scopers() + [
+                    scope_c_math_functions])
+
     # }}}
 
     # {{{ code generation
@@ -846,82 +879,30 @@ class CASTBuilder(ASTBuilderBase):
         return block_if_necessary(assignments)
 
     def emit_multiple_assignment(self, codegen_state, insn):
-        ecm = codegen_state.expression_to_code_mapper
 
-        from pymbolic.primitives import Variable
-        from pymbolic.mapper.stringifier import PREC_NONE
-
-        func_id = insn.expression.function
-        parameters = insn.expression.parameters
-
-        if isinstance(func_id, Variable):
-            func_id = func_id.name
-
-        assignee_var_descriptors = [
-                codegen_state.kernel.get_var_descriptor(a)
-                for a in insn.assignee_var_names()]
-
-        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)
-
-        mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes)
-        if mangle_result is None:
-            raise RuntimeError("function '%s' unknown--"
-                    "maybe you need to register a function mangler?"
-                    % func_id)
-
-        assert mangle_result.arg_dtypes is not None
+        ecm = codegen_state.expression_to_code_mapper
+        func_id = insn.expression.function.name
+        in_knl_callable = codegen_state.kernel.scoped_functions[func_id]
 
-        if mangle_result.target_name == "loopy_make_tuple":
-            # This shorcut avoids actually having to emit a 'make_tuple' function.
+        if in_knl_callable.name_in_target == 'loopy_make_tuple':
             return self.emit_tuple_assignment(codegen_state, insn)
 
-        from loopy.expression import dtype_to_type_context
-        c_parameters = [
-                ecm(par, PREC_NONE,
-                    dtype_to_type_context(self.target, tgt_dtype),
-                    tgt_dtype).expr
-                for par, par_dtype, tgt_dtype in zip(
-                    parameters, par_dtypes, mangle_result.arg_dtypes)]
-
-        from loopy.codegen import SeenFunction
-        codegen_state.seen_functions.add(
-                SeenFunction(func_id,
-                    mangle_result.target_name,
-                    mangle_result.arg_dtypes))
+        in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn(
+                insn=insn,
+                target=self.target,
+                expression_to_code_mapper=ecm)
 
-        from pymbolic import var
-        for i, (a, tgt_dtype) in enumerate(
-                zip(insn.assignees[1:], mangle_result.result_dtypes[1:])):
-            if tgt_dtype != ecm.infer_type(a):
-                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
-                        "side of instruction '%s'" % (i+1, insn.id))
-            c_parameters.append(
-                        # TODO Yuck: The "where-at function": &(...)
-                        var("&")(
-                            ecm(a, PREC_NONE,
-                                dtype_to_type_context(self.target, tgt_dtype),
-                                tgt_dtype).expr))
-
-        from pymbolic import var
-        result = var(mangle_result.target_name)(*c_parameters)
-
-        # In case of no assignees, we are done
-        if len(mangle_result.result_dtypes) == 0:
+        if is_returned:
+            from cgen import Assign
+            lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)
+            return Assign(lhs_code,
+                    CExpression(self.get_c_expression_to_code_mapper(),
+                    in_knl_callable_as_call))
+        else:
             from cgen import ExpressionStatement
             return ExpressionStatement(
-                    CExpression(self.get_c_expression_to_code_mapper(), result))
-
-        result = ecm.wrap_in_typecast(
-                mangle_result.result_dtypes[0],
-                assignee_var_descriptors[0].dtype,
-                result)
-
-        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)
-
-        from cgen import Assign
-        return Assign(
-                lhs_code,
-                CExpression(self.get_c_expression_to_code_mapper(), result))
+                    CExpression(self.get_c_expression_to_code_mapper(),
+                    in_knl_callable_as_call))
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
             lbound, ubound, inner):
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index dd2104d0c..ecb6ad7d9 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -41,7 +41,7 @@ from pymbolic import var
 from loopy.expression import dtype_to_type_context
 from loopy.type_inference import TypeInferenceMapper
 
-from loopy.diagnostic import LoopyError, LoopyWarning
+from loopy.diagnostic import LoopyError
 from loopy.tools import is_integer
 from loopy.types import LoopyType
 
@@ -383,19 +383,18 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                         "for constant '%s'" % expr)
 
     def map_call(self, expr, type_context):
-        from pymbolic.primitives import Variable, Subscript
-
-        identifier = expr.function
+        from pymbolic.primitives import Subscript
 
         # {{{ implement indexof, indexof_vec
 
-        if identifier.name in ["indexof", "indexof_vec"]:
+        identifier_name = self.kernel.scoped_functions[expr.function.name].name
+        if identifier_name in ["indexof", "indexof_vec"]:
             if len(expr.parameters) != 1:
-                raise LoopyError("%s takes exactly one argument" % identifier.name)
+                raise LoopyError("%s takes exactly one argument" % identifier_name)
             arg, = expr.parameters
             if not isinstance(arg, Subscript):
                 raise LoopyError(
-                        "argument to %s must be a subscript" % identifier.name)
+                        "argument to %s must be a subscript" % identifier_name)
 
             ary = self.find_array(arg)
 
@@ -407,11 +406,11 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
             from loopy.kernel.data import ImageArg
             if isinstance(ary, ImageArg):
-                raise LoopyError("%s does not support images" % identifier.name)
+                raise LoopyError("%s does not support images" % identifier_name)
 
-            if identifier.name == "indexof":
+            if identifier_name == "indexof":
                 return access_info.subscripts[0]
-            elif identifier.name == "indexof_vec":
+            elif identifier_name == "indexof_vec":
                 from loopy.kernel.array import VectorArrayDimTag
                 ivec = None
                 for iaxis, dim_tag in enumerate(ary.dim_tags):
@@ -430,56 +429,21 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
         # }}}
 
-        if isinstance(identifier, Variable):
-            identifier = identifier.name
-
-        par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)
-
-        processed_parameters = None
-
-        mangle_result = self.kernel.mangle_function(
-                identifier, par_dtypes,
-                ast_builder=self.codegen_state.ast_builder)
-
-        if mangle_result is None:
-            raise RuntimeError("function '%s' unknown--"
-                    "maybe you need to register a function mangler?"
-                    % identifier)
-
-        if len(mangle_result.result_dtypes) != 1:
-            raise LoopyError("functions with more or fewer than one return value "
-                    "may not be used in an expression")
-
-        if mangle_result.arg_dtypes is not None:
-            processed_parameters = tuple(
-                    self.rec(par,
-                        dtype_to_type_context(self.kernel.target, tgt_dtype),
-                        tgt_dtype)
-                    for par, par_dtype, tgt_dtype in zip(
-                        expr.parameters, par_dtypes, mangle_result.arg_dtypes))
-
-        else:
-            # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
-            # propagate the type context here. But for many others, it does
-            # not. Using the inferred type as a stopgap for now.
-            processed_parameters = tuple(
-                    self.rec(par,
-                        type_context=dtype_to_type_context(
-                            self.kernel.target, par_dtype))
-                    for par, par_dtype in zip(expr.parameters, par_dtypes))
-
-            from warnings import warn
-            warn("Calling function '%s' with unknown C signature--"
-                    "return CallMangleInfo.arg_dtypes"
-                    % identifier, LoopyWarning)
-
-        from loopy.codegen import SeenFunction
-        self.codegen_state.seen_functions.add(
-                SeenFunction(identifier,
-                    mangle_result.target_name,
-                    mangle_result.arg_dtypes or par_dtypes))
-
-        return var(mangle_result.target_name)(*processed_parameters)
+        from loopy.kernel.function_interface import ManglerCallable
+        if isinstance(self.kernel.scoped_functions[expr.function.name],
+                ManglerCallable):
+            from loopy.codegen import SeenFunction
+            in_knl_callable = self.kernel.scoped_functions[expr.function.name]
+            mangle_result = in_knl_callable.mangle_result(self.kernel)
+            self.codegen_state.seen_functions.add(
+                    SeenFunction(identifier_name,
+                        mangle_result.target_name,
+                        mangle_result.arg_dtypes))
+
+        return self.kernel.scoped_functions[expr.function.name].emit_call(
+                expression_to_code_mapper=self,
+                expression=expr,
+                target=self.kernel.target)
 
     # {{{ deal with complex-valued variables
 
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 673d3b284..b2e4118d2 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -34,6 +34,7 @@ from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
 from loopy.kernel.data import AddressSpace
 from pymbolic import var
+from loopy.kernel.function_interface import ScalarCallable
 
 
 # {{{ vector types
@@ -111,29 +112,71 @@ def _register_vector_types(dtype_registry):
 # }}}
 
 
-# {{{ function mangler
+# {{{ function scoper
 
-def cuda_function_mangler(kernel, name, arg_dtypes):
-    if not isinstance(name, str):
-        return None
+_CUDA_SPECIFIC_FUNCTIONS = {
+        "rsqrt": 1,
+        "atan2": 2,
+        }
 
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
-        dtype = np.find_common_type([], arg_dtypes)
 
-        if dtype.kind == "c":
-            raise RuntimeError("min/max do not support complex numbers")
+class CudaCallable(ScalarCallable):
 
-        if dtype.kind == "f":
-            name = "f" + name
+    def with_types(self, arg_id_to_dtype, kernel):
 
-        return dtype, name
+        name = self.name
 
-    if name in "atan2" and len(arg_dtypes) == 2:
-        return arg_dtypes[0], name
+        if name == "dot":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only 2 arguments." % name)
 
-    if name == "dot":
-        scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"]
-        return scalar_dtype, name
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+            dtype = arg_id_to_dtype[0]
+            scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"]
+            return self.copy(name_in_target=name,
+                    arg_id_to_dtype={-1: NumpyType(scalar_dtype),
+                        0: dtype, 1: dtype})
+
+        if name in _CUDA_SPECIFIC_FUNCTIONS:
+            num_args = _CUDA_SPECIFIC_FUNCTIONS[name]
+            for id in arg_id_to_dtype:
+                if not -1 <= id < num_args:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            num_args))
+
+            for i in range(num_args):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in
+                        arg_id_to_dtype.items() if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyError("%s does not support complex numbers"
+                        % name)
+
+            updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1,
+                num_args))
+
+            return self.copy(name_in_target=name,
+                    arg_id_to_dtype=updated_arg_id_to_dtype)
+
+        return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+
+def scope_cuda_functions(target, identifier):
+    if identifier in set(["dot"]) | set(
+            _CUDA_SPECIFIC_FUNCTIONS):
+        return CudaCallable(name=identifier)
 
     return None
 
@@ -217,13 +260,12 @@ class CudaTarget(CTarget):
 # {{{ ast builder
 
 class CUDACASTBuilder(CASTBuilder):
+
     # {{{ library
 
-    def function_manglers(self):
-        return (
-                super(CUDACASTBuilder, self).function_manglers() + [
-                    cuda_function_mangler
-                    ])
+    def function_scopers(self):
+        return [scope_cuda_functions] + (
+                super(CUDACASTBuilder, self).function_scopers())
 
     # }}}
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 432c95ef3..de07adf97 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -31,11 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
 from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
-from loopy.target.c import DTypeRegistryWrapper, c_math_mangler
-from loopy.kernel.data import AddressSpace, CallMangleInfo
+from loopy.target.c import DTypeRegistryWrapper
+from loopy.kernel.data import AddressSpace
+from loopy.kernel.function_interface import ScalarCallable
 from pymbolic import var
 
-from functools import partial
 
 # {{{ dtype registry wrappers
 
@@ -166,59 +166,117 @@ VECTOR_LITERAL_FUNCS = dict(
         )
 
 
-def opencl_function_mangler(kernel, name, arg_dtypes):
-    if not isinstance(name, str):
-        return None
+class OpenCLCallable(ScalarCallable):
+    """
+    Records information about OpenCL functions which are not covered by
+    :class:`loopy.target.c.CMathCallable`.
+    """
+
+    def with_types(self, arg_id_to_dtype, kernel):
+        name = self.name
+
+        if name in ["max", "min"]:
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only 2 arguments." % name)
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype:
+                return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
+                        if (id >= 0 and dtype is not None)])
+
+            if dtype.kind in ['u', 'i', 'f']:
+                if dtype.kind == 'f':
+                    name = 'f'+name
+                dtype = NumpyType(dtype)
+                return self.copy(name_in_target=name,
+                        arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype})
+            else:
+                # Unsupported type.
+                raise LoopyError("%s function not supported for the types %s" %
+                        (name, dtype))
+
+        if name == "dot":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only 2 arguments." % name)
+
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+            dtype = arg_id_to_dtype[0]
+            scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"]
+            return self.copy(name_in_target=name, arg_id_to_dtype={-1:
+                NumpyType(scalar_dtype), 0: dtype, 1: dtype})
+
+        if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
+            num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
+            for id in arg_id_to_dtype:
+                if not -1 <= id < num_args:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            num_args))
+
+            for i in range(num_args):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in
+                        arg_id_to_dtype.items() if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyError("%s does not support complex numbers"
+                        % name)
+
+            updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1,
+                num_args))
+
+            return self.copy(name_in_target=name,
+                    arg_id_to_dtype=updated_arg_id_to_dtype)
+
+        if name in VECTOR_LITERAL_FUNCS:
+            base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
+
+            for id in arg_id_to_dtype:
+                if not -1 <= id < count:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            count))
+
+            for i in range(count):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+            updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in
+                    range(count))
+            updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype(
+                        NumpyType(dtype), count)
+
+            return self.copy(name_in_target="(%s%d) " % (base_tp_name, count),
+                    arg_id_to_dtype=updated_arg_id_to_dtype)
+
+        # does not satisfy any of the conditions needed for specialization.
+        # hence just returning a copy of the callable.
+        return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+
+def scope_opencl_functions(target, identifier):
+    """
+    Returns an instance of :class:`InKernelCallable` if the function defined by
+    *identifier* is known in OpenCL, otherwise returns *None*.
+    """
+    opencl_function_ids = set(["max", "min", "dot"]) | set(
+            _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS)
 
-    # OpenCL has min(), max() for integer types
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
-        dtype = np.find_common_type(
-                [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
-        if dtype.kind == "i":
-            result_dtype = NumpyType(dtype)
-            return CallMangleInfo(
-                    target_name=name,
-                    result_dtypes=(result_dtype,),
-                    arg_dtypes=2*(result_dtype,))
-
-    if name == "dot":
-        scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(NumpyType(scalar_dtype),),
-                arg_dtypes=(arg_dtypes[0],)*2)
-
-    if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
-        num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
-        if len(arg_dtypes) != num_args:
-            raise LoopyError("%s takes %d arguments (%d received)"
-                    % (name, num_args, len(arg_dtypes)))
-
-        dtype = np.find_common_type(
-                [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
-        if dtype.kind == "c":
-            raise LoopyError("%s does not support complex numbers"
-                    % name)
-
-        result_dtype = NumpyType(dtype)
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(result_dtype,),
-                arg_dtypes=(result_dtype,)*num_args)
-
-    if name in VECTOR_LITERAL_FUNCS:
-        base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
-
-        if count != len(arg_dtypes):
-            return None
-
-        return CallMangleInfo(
-                target_name="(%s%d) " % (base_tp_name, count),
-                result_dtypes=(kernel.target.vector_dtype(
-                    NumpyType(dtype), count),),
-                arg_dtypes=(NumpyType(dtype),)*count)
+    if identifier in opencl_function_ids:
+        return OpenCLCallable(name=identifier)
 
     return None
 
@@ -365,13 +423,10 @@ class OpenCLTarget(CTarget):
 class OpenCLCASTBuilder(CASTBuilder):
     # {{{ library
 
-    def function_manglers(self):
+    def function_scopers(self):
         return (
-                [
-                    opencl_function_mangler,
-                    partial(c_math_mangler, modify_name=False)
-                ] +
-                super(OpenCLCASTBuilder, self).function_manglers())
+                [scope_opencl_functions] + super(
+                    OpenCLCASTBuilder, self).function_scopers())
 
     def symbol_manglers(self):
         return (
@@ -380,13 +435,10 @@ class OpenCLCASTBuilder(CASTBuilder):
                     ])
 
     def preamble_generators(self):
-        from loopy.library.reduction import reduction_preamble_generator
 
         return (
                 super(OpenCLCASTBuilder, self).preamble_generators() + [
-                    opencl_preamble_generator,
-                    reduction_preamble_generator,
-                    ])
+                    opencl_preamble_generator])
 
     # }}}
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 73e8e0092..27c4f4ab4 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -31,12 +31,12 @@ from six.moves import range
 
 import numpy as np
 
-from loopy.kernel.data import CallMangleInfo
 from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder
 from loopy.target.python import PythonASTBuilderBase
 from loopy.types import NumpyType
-from loopy.diagnostic import LoopyError, warn_with_kernel
+from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError
 from warnings import warn
+from loopy.kernel.function_interface import ScalarCallable
 
 import logging
 logger = logging.getLogger(__name__)
@@ -199,37 +199,79 @@ def check_sizes(kernel, device):
 # }}}
 
 
-def pyopencl_function_mangler(target, name, arg_dtypes):
-    if len(arg_dtypes) == 1 and isinstance(name, str):
-        arg_dtype, = arg_dtypes
+# {{{ pyopencl function scopers
 
-        if arg_dtype.is_complex():
-            if arg_dtype.numpy_dtype == np.complex64:
-                tpname = "cfloat"
-            elif arg_dtype.numpy_dtype == np.complex128:
-                tpname = "cdouble"
+class PyOpenCLCallable(ScalarCallable):
+    """
+    Records information about the callables which are not covered by
+    :class:`loopy.target.opencl.OpenCLCallable`
+    """
+    def with_types(self, arg_id_to_dtype, kernel):
+
+        name = self.name
+
+        for id in arg_id_to_dtype:
+            # since all the below functions are single arg.
+            if not -1 <= id <= 0:
+                raise LoopyError("%s can only take one argument." % name)
+
+        if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+        dtype = arg_id_to_dtype[0]
+
+        if name in ["real", "imag", "abs"]:
+            if dtype.is_complex():
+                if dtype.numpy_dtype == np.complex64:
+                    tpname = "cfloat"
+                elif dtype.numpy_dtype == np.complex128:
+                    tpname = "cdouble"
+                else:
+                    raise LoopyTypeError("unexpected complex type '%s'" % dtype)
+
+                return self.copy(name_in_target="%s_%s" % (tpname, name),
+                        arg_id_to_dtype={0: dtype, -1: NumpyType(
+                                np.dtype(dtype.numpy_dtype.type(0).real))})
+
+        if name in ["sqrt", "exp", "log",
+                "sin", "cos", "tan",
+                "sinh", "cosh", "tanh",
+                "conj", "abs"]:
+            if dtype.is_complex():
+                # function parameters are complex.
+                if dtype.numpy_dtype == np.complex64:
+                    tpname = "cfloat"
+                elif dtype.numpy_dtype == np.complex128:
+                    tpname = "cdouble"
+                else:
+                    raise LoopyTypeError("unexpected complex type '%s'" % dtype)
+
+                return self.copy(name_in_target="%s_%s" % (tpname, name),
+                        arg_id_to_dtype={0: dtype, -1: dtype})
             else:
-                raise RuntimeError("unexpected complex type '%s'" % arg_dtype)
-
-            if name in ["sqrt", "exp", "log",
-                    "sin", "cos", "tan",
-                    "sinh", "cosh", "tanh",
-                    "conj"]:
-                return CallMangleInfo(
-                        target_name="%s_%s" % (tpname, name),
-                        result_dtypes=(arg_dtype,),
-                        arg_dtypes=(arg_dtype,))
-
-            if name in ["real", "imag", "abs"]:
-                return CallMangleInfo(
-                        target_name="%s_%s" % (tpname, name),
-                        result_dtypes=(NumpyType(
-                            np.dtype(arg_dtype.numpy_dtype.type(0).real)),
-                            ),
-                        arg_dtypes=(arg_dtype,))
+                # function calls for floating parameters.
+                numpy_dtype = dtype.numpy_dtype
+                if numpy_dtype.kind in ('u', 'i'):
+                    dtype = dtype.copy(numpy_dtype=np.float32)
+                if name == 'abs':
+                    name = 'fabs'
+                return self.copy(name_in_target=name,
+                    arg_id_to_dtype={0: dtype, -1: dtype})
+
+        return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+
+def pyopencl_function_scoper(target, identifier):
+    if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh",
+            "tanh", "conj", "real", "imag", "abs"]:
+        return PyOpenCLCallable(name=identifier)
 
     return None
 
+# }}}
+
 
 # {{{ preamble generator
 
@@ -739,19 +781,15 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
 
     # {{{ library
 
-    def function_manglers(self):
-        from loopy.library.random123 import random123_function_mangler
+    def function_scopers(self):
+        from loopy.library.random123 import random123_function_scoper
         return (
-                super(PyOpenCLCASTBuilder, self).function_manglers() + [
-                    pyopencl_function_mangler,
-                    random123_function_mangler
-                    ])
+                [pyopencl_function_scoper, random123_function_scoper] + super(
+                    PyOpenCLCASTBuilder, self).function_scopers())
 
     def preamble_generators(self):
-        from loopy.library.random123 import random123_preamble_generator
         return ([
             pyopencl_preamble_generator,
-            random123_preamble_generator,
             ] + super(PyOpenCLCASTBuilder, self).preamble_generators())
 
     # }}}
diff --git a/loopy/target/python.py b/loopy/target/python.py
index ce04986d3..2804b0fb9 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -82,47 +82,35 @@ class ExpressionToPythonMapper(StringifyMapper):
                 expr, enclosing_prec)
 
     def map_call(self, expr, enclosing_prec):
-        from pymbolic.primitives import Variable
         from pymbolic.mapper.stringifier import PREC_NONE
 
-        identifier = expr.function
+        identifier_name = self.kernel.scoped_functions[expr.function.name].name
 
-        if identifier.name in ["indexof", "indexof_vec"]:
+        if identifier_name in ["indexof", "indexof_vec"]:
             raise LoopyError(
                     "indexof, indexof_vec not yet supported in Python")
 
-        if isinstance(identifier, Variable):
-            identifier = identifier.name
-
-        par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters)
+        from loopy.kernel.function_interface import ManglerCallable
+        in_knl_callable = self.kernel.scoped_functions[expr.function.name]
+        if isinstance(in_knl_callable, ManglerCallable):
+            from loopy.codegen import SeenFunction
+            mangle_result = in_knl_callable.mangle_result(self.kernel)
+            self.codegen_state.seen_functions.add(
+                    SeenFunction(identifier_name,
+                        mangle_result.target_name,
+                        mangle_result.arg_dtypes))
 
         str_parameters = None
+        number_of_assignees = len([key for key in
+            in_knl_callable.arg_id_to_dtype.keys() if key < 0])
 
-        mangle_result = self.kernel.mangle_function(
-                identifier, par_dtypes,
-                ast_builder=self.codegen_state.ast_builder)
-
-        if mangle_result is None:
-            raise RuntimeError("function '%s' unknown--"
-                    "maybe you need to register a function mangler?"
-                    % identifier)
-
-        if len(mangle_result.result_dtypes) != 1:
+        if number_of_assignees != 1:
             raise LoopyError("functions with more or fewer than one return value "
                     "may not be used in an expression")
 
-        str_parameters = [
-                self.rec(par, PREC_NONE)
-                for par, par_dtype, tgt_dtype in zip(
-                    expr.parameters, par_dtypes, mangle_result.arg_dtypes)]
+        str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters]
 
-        from loopy.codegen import SeenFunction
-        self.codegen_state.seen_functions.add(
-                SeenFunction(identifier,
-                    mangle_result.target_name,
-                    mangle_result.arg_dtypes or par_dtypes))
-
-        return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
+        return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters))
 
     def map_group_hw_index(self, expr, enclosing_prec):
         raise LoopyError("plain Python does not have group hw axes")
@@ -189,11 +177,11 @@ class PythonASTBuilderBase(ASTBuilderBase):
 
     # {{{ code generation guts
 
-    def function_manglers(self):
+    def function_scopers(self):
+        from loopy.target.c import scope_c_math_functions
         return (
-                super(PythonASTBuilderBase, self).function_manglers() + [
-                    _numpy_single_arg_function_mangler,
-                    ])
+                super(PythonASTBuilderBase, self).function_scopers() +
+                [scope_c_math_functions])
 
     def preamble_generators(self):
         return (
diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py
index d4dcb3701..d0edcfd78 100644
--- a/loopy/transform/diff.py
+++ b/loopy/transform/diff.py
@@ -398,7 +398,14 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i",
 
     # }}}
 
-    return diff_context.get_new_kernel(), result
+    # Differentiation may lead to the addition of new functions to the kernel.
+    # For example, differentiating `sin(x)` yields `cos(x)`. Hence we need to
+    # scope `cos(x)`.
+    from loopy.kernel.creation import scope_functions
+    differentiated_scoped_kernel = scope_functions(
+            diff_context.get_new_kernel())
+
+    return differentiated_scoped_kernel, result
 
 # }}}
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 010a0658f..a68520525 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -44,6 +44,19 @@ def _debug(kernel, s, *args):
         logger.debug("%s: %s" % (kernel.name, logstr))
 
 
+def get_return_types_as_tuple(arg_id_to_dtype):
+    """Returns the types of the return values (negative ids) as a tuple.
+
+    :param arg_id_to_dtype: An instance of :class:`dict` which denotes a
+                            mapping from the arguments to their inferred types.
+    """
+    return_arg_id_to_dtype = dict((id, dtype) for id, dtype in
+            arg_id_to_dtype.items() if (isinstance(id, int) and id < 0))
+    return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True)
+
+    return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos)
+
+
 # {{{ type inference mapper
 
 class TypeInferenceMapper(CombineMapper):
@@ -60,6 +73,8 @@ class TypeInferenceMapper(CombineMapper):
             new_assignments = {}
         self.new_assignments = new_assignments
         self.symbols_with_unknown_types = set()
+        self.scoped_functions = kernel.scoped_functions
+        self.specialized_functions = {}
 
     def __call__(self, expr, return_tuple=False, return_dtype_set=False):
         kwargs = {}
@@ -250,15 +265,18 @@ class TypeInferenceMapper(CombineMapper):
         return self.rec(expr.aggregate)
 
     def map_call(self, expr, return_tuple=False):
-        from pymbolic.primitives import Variable
+        from pymbolic.primitives import Variable, CallWithKwargs
+        from loopy.symbolic import ScopedFunction
+
+        if isinstance(expr, CallWithKwargs):
+            kw_parameters = expr.kw_parameters
+        else:
+            kw_parameters = {}
 
         identifier = expr.function
-        if isinstance(identifier, Variable):
+        if isinstance(identifier, (Variable, ScopedFunction)):
             identifier = identifier.name
 
-        if identifier in ["indexof", "indexof_vec"]:
-            return [self.kernel.index_dtype]
-
         def none_if_empty(d):
             if d:
                 d, = d
@@ -266,25 +284,121 @@ class TypeInferenceMapper(CombineMapper):
             else:
                 return None
 
-        arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters)
-        if None in arg_dtypes:
-            return []
+        arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in
+                tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items()))
+
+        # specializing the known function wrt type
+        if isinstance(expr.function, ScopedFunction):
+            in_knl_callable = self.scoped_functions[expr.function.name]
+
+            # {{{ checking that there is no overwriting of types of in_knl_callable
+
+            if in_knl_callable.arg_id_to_dtype is not None:
+
+                # specializing an already specialized function.
+                for id, dtype in arg_id_to_dtype.items():
+                    if in_knl_callable.arg_id_to_dtype[id] != arg_id_to_dtype[id]:
+
+                        # {{{ ignoring the cases when there is a discrepancy
+                        # between np.uint and np.int
+
+                        import numpy as np
+                        if in_knl_callable.arg_id_to_dtype[id].dtype.type == (
+                                np.uint32) and (
+                                        arg_id_to_dtype[id].dtype.type == np.int32):
+                            continue
+                        if in_knl_callable.arg_id_to_dtype[id].dtype.type == (
+                                np.uint64) and (
+                                        arg_id_to_dtype[id].dtype.type ==
+                                        np.int64):
+                            continue
+
+                        # }}}
+
+                        raise LoopyError("Overwriting a specialized function "
+                                "is illegal--maybe start with new instance of "
+                                "InKernelCallable?")
+
+            # }}}
+
+            in_knl_callable = in_knl_callable.with_types(
+                        arg_id_to_dtype, self.kernel)
+
+            # storing the type-specialized function so that it can be reused
+            # later
+            self.specialized_functions[expr] = in_knl_callable.with_target(
+                    self.kernel.target)
+
+            new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype
+
+            if new_arg_id_to_dtype is None:
+                return []
+
+            # collecting result dtypes in order of the assignees
+            if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None:
+                if return_tuple:
+                    return [get_return_types_as_tuple(new_arg_id_to_dtype)]
+                else:
+                    return [new_arg_id_to_dtype[-1]]
+
+        elif isinstance(expr.function, Variable):
+            # Since the function is not "scoped", attempt to infer using
+            # kernel.function_manglers
+
+            # {{{ trying to infer using function manglers
+
+            arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in
+                    expr.parameters)
+
+            # finding the function_mangler which would be associated with the
+            # realized function.
+            mangle_result = None
+            for function_mangler in self.kernel.function_manglers:
+                mangle_result = function_mangler(self.kernel, identifier,
+                        arg_dtypes)
+                if mangle_result:
+                    # found a match.
+                    break
 
-        mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
-        if return_tuple:
-            if mangle_result is not None:
-                return [mangle_result.result_dtypes]
-        else:
             if mangle_result is not None:
-                if len(mangle_result.result_dtypes) != 1 and not return_tuple:
-                    raise LoopyError("functions with more or fewer than one "
-                            "return value may only be used in direct assignments")
+                from loopy.kernel.function_interface import (ManglerCallable,
+                        ValueArgDescriptor)
+
+                # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes
+                arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target))
+                        for i, dt in enumerate(mangle_result.arg_dtypes))
+                arg_id_to_dtype.update(dict((-i-1,
+                    dtype.with_target(self.kernel.target)) for i, dtype in enumerate(
+                        mangle_result.result_dtypes)))
+                arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in
+                        enumerate(mangle_result.arg_dtypes))
+                res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in
+                        enumerate(mangle_result.result_dtypes))
+                arg_id_to_descr = dict(arg_descrs+res_descrs)
+
+                # creating the ManglerCallable object corresponding to the
+                # function.
+                self.specialized_functions[expr] = ManglerCallable(
+                        identifier, function_mangler, arg_id_to_dtype,
+                        arg_id_to_descr, mangle_result.target_name)
+
+            # Returning the type.
+            if return_tuple:
+                if mangle_result is not None:
+                    return [mangle_result.result_dtypes]
+            else:
+                if mangle_result is not None:
+                    if len(mangle_result.result_dtypes) != 1 and not return_tuple:
+                        raise LoopyError("functions with more or fewer than one "
+                                "return value may only be used in direct "
+                                "assignments")
+
+                    return [mangle_result.result_dtypes[0]]
+            # }}}
 
-                return [mangle_result.result_dtypes[0]]
+        return []
 
-        raise RuntimeError("unable to resolve "
-                "function '%s' with %d given arguments"
-                % (identifier, len(arg_dtypes)))
+    map_call_with_kwargs = map_call
 
     def map_variable(self, expr):
         if expr.name in self.kernel.all_inames():
@@ -406,7 +520,7 @@ class TypeInferenceMapper(CombineMapper):
 
 def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
     if var_name in kernel.all_params():
-        return [kernel.index_dtype], []
+        return [kernel.index_dtype], [], {}
 
     from functools import partial
     debug = partial(_debug, kernel)
@@ -451,11 +565,12 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
         dtype_sets.append(result)
 
     if not dtype_sets:
-        return None, type_inf_mapper.symbols_with_unknown_types
+        return None, type_inf_mapper.symbols_with_unknown_types, None
 
     result = type_inf_mapper.combine(dtype_sets)
 
-    return result, type_inf_mapper.symbols_with_unknown_types
+    return (result, type_inf_mapper.symbols_with_unknown_types,
+            type_inf_mapper.specialized_functions)
 
 # }}}
 
@@ -553,6 +668,8 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     from loopy.kernel.data import TemporaryVariable, KernelArgument
 
+    specialized_functions = {}
+
     for var_chain in sccs:
         changed_during_last_queue_run = False
         queue = var_chain[:]
@@ -576,7 +693,7 @@ def infer_unknown_types(kernel, expect_completion=False):
 
             debug("inferring type for %s %s", type(item).__name__, item.name)
 
-            result, symbols_with_unavailable_types = (
+            result, symbols_with_unavailable_types, new_specialized_functions = (
                     _infer_var_type(
                             kernel, item.name, type_inf_mapper, subst_expander))
 
@@ -597,6 +714,10 @@ def infer_unknown_types(kernel, expect_completion=False):
                         new_arg_dict[name] = item.copy(dtype=new_dtype)
                     else:
                         raise LoopyError("unexpected item type in type inference")
+                # TODO: I don't like in-place updates. Change this to something
+                # else. Perhaps add a function for doing this, which does it
+                # using a bunch of copies?
+                specialized_functions.update(new_specialized_functions)
             else:
                 debug("     failure")
 
@@ -639,11 +760,23 @@ def infer_unknown_types(kernel, expect_completion=False):
     logger.debug("type inference took {dur:.2f} seconds".format(
             dur=end_time - start_time))
 
-    return unexpanded_kernel.copy(
+    pre_type_specialized_knl = unexpanded_kernel.copy(
             temporary_variables=new_temp_vars,
             args=[new_arg_dict[arg.name] for arg in kernel.args],
             )
 
+    from loopy.kernel.function_interface import (
+            register_pymbolic_calls_to_knl_callables)
+    type_specialized_kernel = register_pymbolic_calls_to_knl_callables(
+            pre_type_specialized_knl, specialized_functions)
+    if expect_completion:
+        # if completion is expected, then it is important that all the
+        # callables are scoped.
+        from loopy.check import check_functions_are_scoped
+        check_functions_are_scoped(type_specialized_kernel)
+
+    return type_specialized_kernel
+
 # }}}
 
 
diff --git a/test/testlib.py b/test/testlib.py
index ad290ee7c..a22988ec8 100644
--- a/test/testlib.py
+++ b/test/testlib.py
@@ -1,4 +1,5 @@
 import loopy as lp
+import numpy as np
 
 
 # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel
@@ -132,4 +133,43 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
 
 # }}}
 
+
+# {{{ test_register_function_lookup
+
+class Log2Callable(lp.ScalarCallable):
+
+    def with_types(self, arg_id_to_dtype, kernel):
+
+        if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+        dtype = arg_id_to_dtype[0].numpy_dtype
+
+        if dtype.kind in ('u', 'i'):
+            # signed and unsigned integers are cast to float32
+            dtype = np.float32
+
+        from loopy.target.opencl import OpenCLTarget
+        name_in_target = "log2"
+        if not isinstance(kernel.target, OpenCLTarget):
+            # for CUDA and C targets the name must be modified
+            if dtype == np.float32:
+                name_in_target = "log2f"
+            elif dtype == np.float128:
+                name_in_target = "log2l"
+
+        from loopy.types import NumpyType
+        return self.copy(name_in_target=name_in_target,
+                arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)})
+
+
+def register_log2_lookup(target, identifier):
+    if identifier == 'log2':
+        return Log2Callable(name='log2')
+    return None
+
+# }}}
+
 # vim: foldmethod=marker
-- 
GitLab