diff --git a/loopy/__init__.py b/loopy/__init__.py
index 50660f60a82bc4bc9ecb1d5a7102ac0afb42e40c..d1e8698be23a9ea2edf733433d38a534589591f1 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -131,6 +131,8 @@ from loopy.target.cuda import CudaTarget
 from loopy.target.opencl import OpenCLTarget
 from loopy.target.pyopencl import PyOpenCLTarget
 from loopy.target.ispc import ISPCTarget
+from loopy.target.numba import NumbaTarget, NumbaCudaTarget
+
 
 __all__ = [
         "TaggedVariable", "Reduction", "LinearSubscript",
@@ -228,6 +230,7 @@ __all__ = [
 
         "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget",
         "PyOpenCLTarget", "ISPCTarget",
+        "NumbaTarget", "NumbaCudaTarget",
         "ASTBuilderBase",
 
         # {{{ from this file
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 71a1c4b3f5972b383af07ec1b07ad5ca3cba020b..e3bcea77b7083ac8e95e73dfa04a3f3cad40d5a3 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -502,23 +502,7 @@ def generate_code_v2(kernel):
     for prea_gen in preamble_generators:
         preambles.extend(prea_gen(preamble_info))
 
-    seen_preamble_tags = set()
-    dedup_preambles = []
-
-    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
-        if tag in seen_preamble_tags:
-            continue
-
-        seen_preamble_tags.add(tag)
-        dedup_preambles.append(preamble)
-
-    from loopy.tools import remove_common_indentation
-    preamble_codes = [
-            remove_common_indentation(lines) + "\n"
-            for lines in dedup_preambles]
-
-    codegen_result = codegen_result.copy(
-            device_preambles=preamble_codes)
+    codegen_result = codegen_result.copy(device_preambles=preambles)
 
     # }}}
 
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 82dbe09986b2567e9951b2446313a8ef5fde4f8c..4f8ff6117b9cc1709228ec91db5d8ea85f0fab22 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -26,6 +26,23 @@ import six
 from pytools import Record
 
 
+def process_preambles(preambles):
+    seen_preamble_tags = set()
+    dedup_preambles = []
+
+    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
+        if tag in seen_preamble_tags:
+            continue
+
+        seen_preamble_tags.add(tag)
+        dedup_preambles.append(preamble)
+
+    from loopy.tools import remove_common_indentation
+    return [
+            remove_common_indentation(lines) + "\n"
+            for lines in dedup_preambles]
+
+
 # {{{ code generation result
 
 class GeneratedProgram(Record):
@@ -65,6 +82,7 @@ class CodeGenerationResult(Record):
 
     .. automethod:: host_code
     .. automethod:: device_code
+    .. automethod:: all_code
 
     .. attribute:: implemented_data_info
 
@@ -96,7 +114,7 @@ class CodeGenerationResult(Record):
                 **kwargs)
 
     def host_code(self):
-        preamble_codes = getattr(self, "host_preambles", [])
+        preamble_codes = process_preambles(getattr(self, "host_preambles", []))
 
         return (
                 "".join(preamble_codes)
@@ -104,13 +122,27 @@ class CodeGenerationResult(Record):
                 str(self.host_program.ast))
 
     def device_code(self):
-        preamble_codes = getattr(self, "device_preambles", [])
+        preamble_codes = process_preambles(getattr(self, "device_preambles", []))
 
         return (
                 "".join(preamble_codes)
                 + "\n"
                 + "\n\n".join(str(dp.ast) for dp in self.device_programs))
 
+    def all_code(self):
+        preamble_codes = process_preambles(
+                getattr(self, "host_preambles", [])
+                +
+                getattr(self, "device_preambles", [])
+                )
+
+        return (
+                "".join(preamble_codes)
+                + "\n"
+                + "\n\n".join(str(dp.ast) for dp in self.device_programs)
+                + "\n\n"
+                + str(self.host_program.ast))
+
     def current_program(self, codegen_state):
         if codegen_state.is_generating_device_code:
             if self.device_programs:
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index aa50f03cb63eaa93a2e807f90c8dd08cd3ab4f4d..51b5f5343b90ac644d9f1d222637bf428d833d6e 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -732,7 +732,7 @@ class ExpressionToCMapper(RecursiveMapper):
         raise LoopyError("plain C does not have group hw axes")
 
     def map_local_hw_index(self, expr, enclosing_prec, type_context):
-        raise LoopyError("plain C does not have group hw axes")
+        raise LoopyError("plain C does not have local hw axes")
 
 # }}}
 
diff --git a/loopy/target/numba.py b/loopy/target/numba.py
new file mode 100644
index 0000000000000000000000000000000000000000..95c1de08c9ef90bda6438d613e45e0515508573d
--- /dev/null
+++ b/loopy/target/numba.py
@@ -0,0 +1,220 @@
+"""Python host AST builder for integration with PyOpenCL."""
+
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from pytools import memoize_method
+
+from loopy.target.python import ExpressionToPythonMapper, PythonASTBuilderBase
+from loopy.target import TargetBase, DummyHostASTBuilder
+
+from loopy.diagnostic import LoopyWarning
+
+
+# {{{ base numba
+
+def _base_numba_preamble_generator(preamble_info):
+    yield ("06_numba_imports", """
+            import numba as _lpy_numba
+            """)
+
+
+class NumbaBaseASTBuilder(PythonASTBuilderBase):
+    def preamble_generators(self):
+        return (
+                super(NumbaBaseASTBuilder, self).preamble_generators() + [
+                    _base_numba_preamble_generator
+                    ])
+
+    def get_function_definition(self, codegen_state, codegen_result,
+            schedule_index,
+            function_decl, function_body):
+
+        assert function_decl is None
+
+        from genpy import Function
+        return Function(
+                codegen_result.current_program(codegen_state).name,
+                [idi.name for idi in codegen_state.implemented_data_info],
+                function_body,
+                decorators=self.get_python_function_decorators())
+
+    def get_python_function_decorators(self):
+        return ()
+
+    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from genpy import Statement
+
+        ecm = self.get_expression_to_code_mapper(codegen_state)
+        implemented_data_info = codegen_state.implemented_data_info
+
+        return Statement(
+            "%s[%s, %s](%s)" % (
+                name,
+                ecm(gsize, PREC_NONE),
+                ecm(lsize, PREC_NONE),
+                ", ".join(idi.name for idi in implemented_data_info)
+                ))
+
+
+class NumbaJITASTBuilder(NumbaBaseASTBuilder):
+    def get_python_function_decorators(self):
+        return ("@_lpy_numba.jit",)
+
+
+class NumbaTarget(TargetBase):
+    """A target for plain Python as understood by Numba, without any parallel extensions.
+    """
+
+    def __init__(self):
+        from warnings import warn
+        warn("The Numba targets are not yet feature-complete",
+                LoopyWarning, stacklevel=2)
+
+    def split_kernel_at_global_barriers(self):
+        return False
+
+    def get_host_ast_builder(self):
+        return DummyHostASTBuilder(self)
+
+    def get_device_ast_builder(self):
+        return NumbaJITASTBuilder(self)
+
+    # {{{ types
+
+    @memoize_method
+    def get_dtype_registry(self):
+        from loopy.target.c import DTypeRegistryWrapper
+        from loopy.target.c.compyte.dtypes import (
+                DTypeRegistry, fill_registry_with_c_types)
+        result = DTypeRegistry()
+        fill_registry_with_c_types(result, respect_windows=False,
+                include_bool=True)
+        return DTypeRegistryWrapper(result)
+
+    def is_vector_dtype(self, dtype):
+        return False
+
+    def get_vector_dtype(self, base, count):
+        raise KeyError()
+
+    def get_or_register_dtype(self, names, dtype=None):
+        # These kind of shouldn't be here.
+        return self.get_dtype_registry().get_or_register_dtype(names, dtype)
+
+    def dtype_to_typename(self, dtype):
+        # These kind of shouldn't be here.
+        return self.get_dtype_registry().dtype_to_ctype(dtype)
+
+    # }}}
+
+# }}}
+
+
+# {{{ numba.cuda
+
+class NumbaCudaExpressionToPythonMapper(ExpressionToPythonMapper):
+    _GRID_AXES = "xyz"
+
+    def map_group_hw_index(self, expr, enclosing_prec):
+        return "_lpy_ncu.blockIdx.%s" % self._GRID_AXES[expr.axis]
+
+    def map_local_hw_index(self, expr, enclosing_prec):
+        return "_lpy_ncu.threadIdx.%s" % self._GRID_AXES[expr.axis]
+
+
+def _cuda_numba_preamble_generator(preamble_info):
+    yield ("06_import_numba_cuda", """
+            import numba.cuda as _lpy_ncu
+            """)
+
+
+class NumbaCudaASTBuilder(NumbaBaseASTBuilder):
+    def preamble_generators(self):
+        return (
+                super(NumbaCudaASTBuilder, self).preamble_generators() + [
+                    _cuda_numba_preamble_generator
+                    ])
+
+    def get_python_function_decorators(self):
+        return ("@_lpy_ncu.jit",)
+
+    def get_expression_to_code_mapper(self, codegen_state):
+        return NumbaCudaExpressionToPythonMapper(codegen_state)
+
+
+class NumbaCudaTarget(TargetBase):
+    """A target for plain Python, without any parallel extensions.
+    """
+
+    host_program_name_suffix = ""
+    device_program_name_suffix = "_inner"
+
+    def __init__(self):
+        from warnings import warn
+        warn("The Numba target is not yet feature-complete",
+                LoopyWarning, stacklevel=2)
+
+    def split_kernel_at_global_barriers(self):
+        return True
+
+    def get_host_ast_builder(self):
+        return NumbaBaseASTBuilder(self)
+
+    def get_device_ast_builder(self):
+        return NumbaCudaASTBuilder(self)
+
+    # {{{ types
+
+    @memoize_method
+    def get_dtype_registry(self):
+        from loopy.target.c import DTypeRegistryWrapper
+        from loopy.target.c.compyte.dtypes import (
+                DTypeRegistry, fill_registry_with_c_types)
+        result = DTypeRegistry()
+        fill_registry_with_c_types(result, respect_windows=False,
+                include_bool=True)
+        return DTypeRegistryWrapper(result)
+
+    def is_vector_dtype(self, dtype):
+        return False
+
+    def get_vector_dtype(self, base, count):
+        raise KeyError()
+
+    def get_or_register_dtype(self, names, dtype=None):
+        # These kind of shouldn't be here.
+        return self.get_dtype_registry().get_or_register_dtype(names, dtype)
+
+    def dtype_to_typename(self, dtype):
+        # These kind of shouldn't be here.
+        return self.get_dtype_registry().dtype_to_ctype(dtype)
+
+    # }}}
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 83e8df12459dc0b79d0789341dfc1213c008e084..f2a529c34bc4c56f69db1693058308769919e27a 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -24,11 +24,14 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
+import numpy as np
 
+from pymbolic.mapper import Mapper
 from pymbolic.mapper.stringifier import StringifyMapper
 from loopy.expression import TypeInferenceMapper
 from loopy.kernel.data import ValueArg
-from loopy.diagnostic import LoopyError
+from loopy.diagnostic import LoopyError  # noqa
 from loopy.target import ASTBuilderBase
 
 
@@ -43,6 +46,9 @@ class ExpressionToPythonMapper(StringifyMapper):
             type_inf_mapper = TypeInferenceMapper(self.kernel)
         self.type_inf_mapper = type_inf_mapper
 
+    def handle_unsupported_expression(self, victim, enclosing_prec):
+        return Mapper.handle_unsupported_expression(self, victim, enclosing_prec)
+
     def rec(self, expr, prec, type_context=None, needed_dtype=None):
         return super(ExpressionToPythonMapper, self).rec(expr, prec)
 
@@ -52,6 +58,12 @@ class ExpressionToPythonMapper(StringifyMapper):
         return repr(expr)
 
     def map_variable(self, expr, enclosing_prec):
+        if expr.name in self.codegen_state.var_subst_map:
+            # Unimplemented: annotate_inames
+            return str(self.rec(
+                self.codegen_state.var_subst_map[expr.name],
+                enclosing_prec))
+
         if expr.name in self.kernel.all_inames():
             return super(ExpressionToPythonMapper, self).map_variable(
                     expr, enclosing_prec)
@@ -61,24 +73,147 @@ class ExpressionToPythonMapper(StringifyMapper):
             return super(ExpressionToPythonMapper, self).map_variable(
                     expr, enclosing_prec)
 
-        raise LoopyError("may not refer to %s '%s' in host code"
-                % (type(var_descr).__name__, expr.name))
+        return super(ExpressionToPythonMapper, self).map_variable(
+                expr, enclosing_prec)
 
     def map_subscript(self, expr, enclosing_prec):
-        raise LoopyError("may not subscript '%s' in host code"
-                % expr.aggregate.name)
+        return super(ExpressionToPythonMapper, self).map_subscript(
+                expr, enclosing_prec)
+
+    def map_call(self, expr, enclosing_prec):
+        from pymbolic.primitives import Variable
+        from pymbolic.mapper.stringifier import PREC_NONE
+
+        identifier = expr.function
+
+        if identifier.name in ["indexof", "indexof_vec"]:
+            raise LoopyError(
+                    "indexof, indexof_vec not yet supported in Python")
+
+        if isinstance(identifier, Variable):
+            identifier = identifier.name
+
+        par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters)
+
+        str_parameters = None
+
+        mangle_result = self.kernel.mangle_function(
+                identifier, par_dtypes,
+                ast_builder=self.codegen_state.ast_builder)
+
+        if mangle_result is None:
+            raise RuntimeError("function '%s' unknown--"
+                    "maybe you need to register a function mangler?"
+                    % identifier)
+
+        if len(mangle_result.result_dtypes) != 1:
+            raise LoopyError("functions with more or fewer than one return value "
+                    "may not be used in an expression")
+
+        str_parameters = [
+                self.rec(par, PREC_NONE)
+                for par, par_dtype, tgt_dtype in zip(
+                    expr.parameters, par_dtypes, mangle_result.arg_dtypes)]
+
+        from loopy.codegen import SeenFunction
+        self.codegen_state.seen_functions.add(
+                SeenFunction(identifier,
+                    mangle_result.target_name,
+                    mangle_result.arg_dtypes or par_dtypes))
+
+        return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
+
+    def map_group_hw_index(self, expr, enclosing_prec):
+        raise LoopyError("plain Python does not have group hw axes")
+
+    def map_local_hw_index(self, expr, enclosing_prec):
+        raise LoopyError("plain Python does not have local hw axes")
 
 # }}}
 
 
 # {{{ ast builder
 
+def _numpy_single_arg_function_mangler(kernel, name, arg_dtypes):
+    if (not isinstance(name, str)
+            or not hasattr(np, name)
+            or len(arg_dtypes) != 1):
+        return None
+
+    arg_dtype, = arg_dtypes
+
+    from loopy.kernel.data import CallMangleInfo
+    return CallMangleInfo(
+            target_name="_lpy_np."+name,
+            result_dtypes=(arg_dtype,),
+            arg_dtypes=arg_dtypes)
+
+
+def _base_python_preamble_generator(preamble_info):
+    yield ("00_future", "from __future__ import division, print_function\n")
+    yield ("05_numpy_import", """
+            import numpy as _lpy_np
+            """)
+
+
 class PythonASTBuilderBase(ASTBuilderBase):
     """A Python host AST builder for integration with PyOpenCL.
     """
 
     # {{{ code generation guts
 
+    def function_manglers(self):
+        return (
+                super(PythonASTBuilderBase, self).function_manglers() + [
+                    _numpy_single_arg_function_mangler,
+                    ])
+
+    def preamble_generators(self):
+        return (
+                super(PythonASTBuilderBase, self).preamble_generators() + [
+                    _base_python_preamble_generator
+                    ])
+
+    def get_function_declaration(self, codegen_state, codegen_result,
+            schedule_index):
+        return None
+
+    def get_function_definition(self, codegen_state, codegen_result,
+            schedule_index,
+            function_decl, function_body):
+
+        assert function_decl is None
+
+        from genpy import Function
+        return Function(
+                codegen_result.current_program(codegen_state).name,
+                [idi.name for idi in codegen_state.implemented_data_info],
+                function_body)
+
+    def get_temporary_decls(self, codegen_state, schedule_index):
+        kernel = codegen_state.kernel
+        ecm = codegen_state.expression_to_code_mapper
+
+        result = []
+
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from genpy import Assign
+
+        for tv in sorted(
+                six.itervalues(kernel.temporary_variables),
+                key=lambda tv: tv.name):
+            if tv.shape:
+                result.append(
+                        Assign(
+                            tv.name,
+                            "_lpy_np.empty(%s, dtype=%s)"
+                            % (
+                                ecm(tv.shape, PREC_NONE, "i"),
+                                "_lpy_np."+tv.dtype.numpy_dtype.name
+                                )))
+
+        return result
+
     def get_expression_to_code_mapper(self, codegen_state):
         return ExpressionToPythonMapper(codegen_state)
 
@@ -121,6 +256,10 @@ class PythonASTBuilderBase(ASTBuilderBase):
         from genpy import If
         return If(condition_str, ast)
 
+    def emit_assignment(self, codegen_state, lhs, rhs):
+        from genpy import Assign
+        return Assign(lhs, rhs)
+
     # }}}
 
 # }}}
diff --git a/test/test_target.py b/test/test_target.py
index 67204d9c1fbc02624eebf230b87d33c114c21ef1..e8548ccca6732f2fac6ea84d98d5b06f1d472c6c 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -193,6 +193,37 @@ def test_clamp(ctx_factory):
     evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
 
 
+def test_numba_target():
+    knl = lp.make_kernel(
+        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
+        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
+        target=lp.NumbaTarget())
+
+    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})
+
+    print(lp.generate_code_v2(knl).device_code())
+
+
+def test_numba_cuda_target():
+    knl = lp.make_kernel(
+        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
+        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
+        target=lp.NumbaCudaTarget())
+
+    knl = lp.assume(knl, "M>0")
+    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
+    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
+    knl = lp.add_prefetch(knl, "X[i,:]")
+    knl = lp.fix_parameters(knl, N=3)
+    knl = lp.set_loop_priority(knl, "i_inner,j_outer")
+    knl = lp.tag_inames(knl, "k:unr")
+    knl = lp.tag_array_axes(knl, "X", "N0,N1")
+
+    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})
+
+    print(lp.generate_code_v2(knl).all_code())
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])