diff --git a/MEMO b/MEMO
index 02eaf2264a85c6a81c840c091dc90d288df25dc0..3e4be2755f1aee8932850938103155ed379cd1af 100644
--- a/MEMO
+++ b/MEMO
@@ -41,6 +41,10 @@ Things to consider
 To-do
 ^^^^^
 
+- Multi-domain
+
+- Kernel splitting (via what variables get computed in a kernel)
+
 - What if no universally valid precompute base index expression is found?
   (test_intel_matrix_mul with n = 6*16, e.g.?)
 
@@ -66,6 +70,8 @@ To-do
 Future ideas
 ^^^^^^^^^^^^
 
+- Array language
+
 - reg rolling
 
 - When duplicating inames, use iname aliases to relieve burden on isl
diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py
index fad00937348d7bb419c2ae802f729a2054395ed5..5a387a68e64bdfca4fb65005605fc2606c94e779 100644
--- a/loopy/codegen/expression.py
+++ b/loopy/codegen/expression.py
@@ -90,6 +90,12 @@ class TypeInferenceMapper(CombineMapper):
         if expr.name in self.kernel.all_inames():
             return np.dtype(np.int16) # don't force single-precision upcast
 
+        for mangler in self.kernel.symbol_manglers:
+            result = mangler(expr.name)
+            if result is not None:
+                result_dtype, _ = result
+                return result_dtype
+
         raise TypeInferenceFailure("nothing known about '%s'" % expr.name)
 
     def map_lookup(self, expr):
@@ -162,14 +168,19 @@ class LoopyCCodeMapper(CCodeMapper):
                         self.rec(self.var_subst_map[expr.name], prec))
             else:
                 return str(self.rec(self.var_subst_map[expr.name], prec))
-        else:
-            if expr.name in self.kernel.arg_dict:
-                arg = self.kernel.arg_dict[expr.name]
-                from loopy.kernel import _ShapedArg
-                if isinstance(arg, _ShapedArg) and arg.shape == ():
-                    return "*"+expr.name
-
-            return CCodeMapper.map_variable(self, expr, prec)
+        elif expr.name in self.kernel.arg_dict:
+            arg = self.kernel.arg_dict[expr.name]
+            from loopy.kernel import _ShapedArg
+            if isinstance(arg, _ShapedArg) and arg.shape == ():
+                return "*"+expr.name
+
+        for mangler in self.kernel.symbol_manglers:
+            result = mangler(expr.name)
+            if result is not None:
+                _, c_name = result
+                return c_name
+
+        return CCodeMapper.map_variable(self, expr, prec)
 
     def map_tagged_variable(self, expr, enclosing_prec):
         return expr.name
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 88382ba3842b79b5516719a3dbcf77c051da76bf..f2ac06851edfd0c35afb3248f8679b334fcf5b89 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -153,6 +153,15 @@ class CompiledKernel:
             is_written = arg.name in self.kernel.get_written_variables()
 
             val = kwargs_copy.pop(arg.name, None)
+
+            # automatically transfer host-side arrays
+            if isinstance(arg, lp.GlobalArg):
+                if isinstance(val, np.ndarray):
+                    # synchronous, so nothing to worry about
+                    val = cl_array.to_device(queue, val, allocator=allocator)
+                elif val is not None:
+                    encountered_non_numpy = True
+
             if val is None:
                 if not is_written:
                     raise TypeError("must supply input argument '%s'" % arg.name)
@@ -168,14 +177,6 @@ class CompiledKernel:
             else:
                 assert _arg_matches_spec(arg, val, kwargs)
 
-            # automatically transfer host-side arrays
-            if isinstance(arg, lp.GlobalArg):
-                if isinstance(val, np.ndarray):
-                    # synchronous, so nothing to worry about
-                    val = cl_array.to_device(queue, val, allocator=allocator)
-                else:
-                    encountered_non_numpy = True
-
             if is_written:
                 outputs.append(val)
 
@@ -196,7 +197,7 @@ class CompiledKernel:
                     *args,
                     g_times_l=True, wait_for=wait_for)
 
-        if out_host is None and encountered_non_numpy:
+        if out_host is None and not encountered_non_numpy:
             out_host = True
         if out_host:
             outputs = [o.get() for o in outputs]
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 051b57fec954b1ece1c30f8c9296ac503a880fde..de67d95722336a0c4e292dfe5c81778e225cd2e1 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -114,7 +114,7 @@ def parse_tag(tag):
 
 # {{{ arguments
 
-class _ShapedArg:
+class _ShapedArg(object):
     def __init__(self, name, dtype, strides=None, shape=None, order="C",
             offset=0):
         """
@@ -192,7 +192,7 @@ class ConstantArg(_ShapedArg):
         return "<ConstantArg '%s' of type %s and shape (%s)>" % (
                 self.name, self.dtype, ",".join(str(i) for i in self.shape))
 
-class ImageArg:
+class ImageArg(object):
     def __init__(self, name, dtype, dimensions=None, shape=None):
         self.name = name
         self.dtype = np.dtype(dtype)
@@ -211,7 +211,7 @@ class ImageArg:
         return "<ImageArg '%s' of type %s>" % (self.name, self.dtype)
 
 
-class ScalarArg:
+class ScalarArg(object):
     def __init__(self, name, dtype, approximately=None):
         self.name = name
         self.dtype = np.dtype(dtype)
@@ -421,7 +421,7 @@ def expand_defines(insn, defines):
 
 # }}}
 
-# {{{ function manglers
+# {{{ function manglers / symbol manglers
 
 def default_function_mangler(name, arg_dtypes):
     from loopy.reduction import reduction_function_mangler
@@ -441,6 +441,20 @@ def single_arg_function_mangler(name, arg_dtypes):
 
     return None
 
+def opencl_symbol_mangler(name):  # -> (dtype, c_name), or None if not recognized
+    # FIXME: should be more picky about exact names (only prefixes are checked)
+    if name.startswith("FLT_"):
+        return np.dtype(np.float32), name
+    elif name.startswith("DBL_"):
+        return np.dtype(np.float64), name
+    elif name.startswith("M_"):
+        if name.endswith("_F"):  # "_F" suffix selects single precision
+            return np.dtype(np.float32), name
+        else:
+            return np.dtype(np.float64), name
+    else:
+        return None  # name is not handled by this mangler
+
 # }}}
 
 # {{{ preamble generators
@@ -505,8 +519,11 @@ class LoopKernel(Record):
     :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule`
         objects
     :ivar function_manglers: list of functions of signature (name, arg_dtypes)
-        returning a tuple (result_dtype, function_name), where the function_name
+        returning a tuple (result_dtype, c_name), where c_name
         is the C-level function to be called.
+    :ivar symbol_manglers: list of functions of signature (name) returning
+        a tuple (result_dtype, c_name), where c_name is the C-level symbol to be
+        evaluated, or None if the mangler does not recognize the name.
     :ivar defines: a dictionary of replacements to be made in instructions given
         as strings before parsing. A macro instance intended to be replaced should
         look like "{MACRO}" in the instruction code. The expansion given in this
@@ -541,6 +558,7 @@ class LoopKernel(Record):
             iname_to_tag={},
             substitutions={},
             function_manglers=[default_function_mangler, single_arg_function_mangler],
+            symbol_manglers=[opencl_symbol_mangler],
             defines={},
 
             # non-user-facing
@@ -758,7 +776,8 @@ class LoopKernel(Record):
                 lowest_priority_inames=lowest_priority_inames,
                 breakable_inames=breakable_inames,
                 applied_iname_rewrites=applied_iname_rewrites,
-                function_manglers=function_manglers)
+                function_manglers=function_manglers,
+                symbol_manglers=symbol_manglers)
 
     def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()):
         if insns is None: