diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index f16f8bfdbb26b716af27762d8502bff592496d7c..5609fc25328ed5b0a243eaa8d33c4d1aa6b3eddc 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -119,6 +119,8 @@ Finishing up
 
 .. autofunction:: generate_code_v2
 
+.. autofunction:: generate_header
+
 Setting options
 ---------------
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 9b6071105e31045c7267891f62a153a4da343b18..c12d7318c24e4ca1edb327f9710ae2182e5491b0 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -119,7 +119,7 @@ from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
-        generate_code, generate_code_v2, generate_body, generate_header)
+        generate_code, generate_code_v2, generate_body)
 from loopy.codegen.result import (
         GeneratedProgram,
         CodeGenerationResult)
@@ -130,7 +130,7 @@ from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran,
         parse_fortran)
 
 from loopy.target import TargetBase, ASTBuilderBase
-from loopy.target.c import CTarget
+from loopy.target.c import CTarget, generate_header
 from loopy.target.cuda import CudaTarget
 from loopy.target.opencl import OpenCLTarget
 from loopy.target.pyopencl import PyOpenCLTarget
@@ -238,7 +238,9 @@ __all__ = [
 
         "LoopyError", "LoopyWarning",
 
-        "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget",
+        "TargetBase",
+        "CTarget", "generate_header",
+        "CudaTarget", "OpenCLTarget",
         "PyOpenCLTarget", "ISPCTarget",
         "NumbaTarget", "NumbaCudaTarget",
         "ASTBuilderBase",
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index c42fd2339c866cf806f51b5451e85164232f9a66..ffd291d530898a817b7be66a790d5d57fb88a99d 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -550,21 +550,4 @@ def generate_body(kernel):
 
 # }}}
 
-def generate_header(kernel):
-    try:
-        fde = kernel.target.get_device_decl_extractor()
-    except NotImplementedError:
-        from warnings import warn
-        warn('Header generation for non C-based languages are not implemented',
-                RuntimeWarning, stacklevel=2)
-        return None
-
-    codegen_result = generate_code_v2(kernel)
-
-    headers = []
-    for dev_prg in codegen_result.device_programs:
-        headers.append(str(fde(dev_prg.decl_ast)))
-
-    return '\n'.join(headers)
-
 # vim: foldmethod=marker
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 29280ac2b901ddecc0843a8f8fe97a5981c7cdf8..04fab05afdc38a8843a566e0e6e6b10098d6415c 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -61,10 +61,6 @@ class GeneratedProgram(Record):
         Once generated, this captures the AST of the operative function
         body (including declaration of necessary temporaries), but not
         the overall function definition.
-    .. attribute:: decl_ast
-        Once generated, this captures the AST of the operative function
-        definition but not the function body or declaration of necessary
-        temporaries.
     """
 
 
@@ -314,7 +310,6 @@ def generate_host_or_device_program(codegen_state, schedule_index):
             codegen_state,
             cur_prog.copy(
                 ast=ast_builder.process_ast(fdef_ast),
-                decl_ast=ast_builder.process_ast(fdecl_ast),
                 body_ast=ast_builder.process_ast(body_ast)))
 
     return codegen_result
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index fc80fc3d9c0382fa17bc0ebaf61049949dc3e977..409b9badb639c500e70404e781036b2e39bf333f 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -138,13 +138,6 @@ class TargetBase(object):
         """
         raise NotImplementedError()
 
-    def get_device_decl_extractor(self):
-        """
-        :returns: a FunctionDeclExtractor (if implemented) that extracts a device function's
-            header definition
-        """
-        raise NotImplementedError()
-
 
 class ASTBuilderBase(object):
     """An interface for generating (host or device) ASTs.
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 513386a58d5b933c20dfc95879863d00e1f4dcb4..8b81efb312d996a2cd972a416ddf7d190e580d13 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -29,7 +29,7 @@ import six
 import numpy as np  # noqa
 from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
 from loopy.diagnostic import LoopyError
-from cgen import Pointer
+from cgen import Pointer, NestedDeclarator
 from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
 from pymbolic.mapper.stringifier import PREC_NONE
 from loopy.symbolic import IdentityMapper
@@ -132,6 +132,10 @@ class POD(Declarator):
 
     mapper_method = "map_loopy_pod"
 
+
+class FunctionDeclarationWrapper(NestedDeclarator):
+    mapper_method = "map_function_decl_wrapper"
+
 # }}}
 
 
@@ -203,6 +207,10 @@ class CASTIdentityMapper(CASTIdentityMapperBase):
     def map_loopy_pod(self, node, *args, **kwargs):
         return type(node)(node.ast_builder, node.dtype, node.name)
 
+    def map_function_decl_wrapper(self, node, *args, **kwargs):
+        return FunctionDeclarationWrapper(
+                self.rec(node.subdecl, *args, **kwargs))
+
 
 class SubscriptSubsetCounter(IdentityMapper):
     def __init__(self, subset_counters):
@@ -259,9 +267,6 @@ class CTarget(TargetBase):
     def get_device_ast_builder(self):
         return CASTBuilder(self)
 
-    def get_device_decl_extractor(self):
-        return CFunctionDeclExtractor()
-
     # {{{ types
 
     @memoize_method
@@ -381,10 +386,11 @@ class CASTBuilder(ASTBuilderBase):
         if self.target.fortran_abi:
             name += "_"
 
-        return FunctionDeclaration(
-                        Value("void", name),
-                        [self.idi_to_cgen_declarator(codegen_state.kernel, idi)
-                            for idi in codegen_state.implemented_data_info])
+        return FunctionDeclarationWrapper(
+                FunctionDeclaration(
+                    Value("void", name),
+                    [self.idi_to_cgen_declarator(codegen_state.kernel, idi)
+                        for idi in codegen_state.implemented_data_info]))
 
     def get_temporary_decls(self, codegen_state, schedule_index):
         from loopy.kernel.data import temp_var_scope
@@ -756,12 +762,46 @@ class CASTBuilder(ASTBuilderBase):
         sc(node)
         return node
 
+
+# {{{ header generation
+
 class CFunctionDeclExtractor(CASTIdentityMapper):
     def __init__(self):
         self.decls = []
 
-    def map_function_declaration(self, node):
-        self.decls.append(node)
-        return super(self.__class__, self).map_function_declaration(node)
+    def map_expression(self, expr):
+        return expr
+
+    def map_function_decl_wrapper(self, node):
+        self.decls.append(node.subdecl)
+        return super(CFunctionDeclExtractor, self)\
+                .map_function_decl_wrapper(node)
+
+
+def generate_header(kernel, codegen_result=None):
+    """
+    :arg kernel: a :class:`loopy.LoopKernel`
+    :arg codegen_result: an instance of :class:`loopy.CodeGenerationResult`
+    :returns: a list of AST nodes (which may have :func:`str`
+        called on them to produce a string) representing
+        function declarations for the generated device
+        functions.
+    """
+
+    if not isinstance(kernel.target, CTarget):
+        raise LoopyError(
+                'Header generation for non C-based languages are not implemented')
+
+    if codegen_result is None:
+        from loopy.codegen import generate_code_v2
+        codegen_result = generate_code_v2(kernel)
+
+    fde = CFunctionDeclExtractor()
+    for dev_prg in codegen_result.device_programs:
+        fde(dev_prg.ast)
+
+    return fde.decls
+
+# }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index d31718f15ed563bba0b602e6017536b72b6deed0..51ccc301290460c49af73691f2ebe812b8a6da9b 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -99,6 +99,7 @@ def _create_vector_types():
             vec.types[np.dtype(base_type), count] = dtype
             vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count
 
+
 _create_vector_types()
 
 
@@ -232,6 +233,10 @@ class CUDACASTBuilder(CASTBuilder):
         fdecl = super(CUDACASTBuilder, self).get_function_declaration(
                 codegen_state, codegen_result, schedule_index)
 
+        from loopy.target.c import FunctionDeclarationWrapper
+        assert isinstance(fdecl, FunctionDeclarationWrapper)
+        fdecl = fdecl.subdecl
+
         from cgen.cuda import CudaGlobal, CudaLaunchBounds
         fdecl = CudaGlobal(fdecl)
 
@@ -254,7 +259,7 @@ class CUDACASTBuilder(CASTBuilder):
 
             fdecl = CudaLaunchBounds(nthreads, fdecl)
 
-        return fdecl
+        return FunctionDeclarationWrapper(fdecl)
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 2c48fb902e746780599f039a44b2d0a5ea787b88..80a69bd00c99258b709ea18b2a716c339b888b02 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -237,16 +237,19 @@ class ISPCASTBuilder(CASTBuilder):
         arg_names, arg_decls = self._arg_names_and_decls(codegen_state)
 
         if codegen_state.is_generating_device_code:
-            return ISPCTask(
+            result = ISPCTask(
                         FunctionDeclaration(
                             Value("void", name),
                             arg_decls))
         else:
-            return ISPCExport(
+            result = ISPCExport(
                     FunctionDeclaration(
                         Value("void", name),
                         arg_decls))
 
+        from loopy.target.c import FunctionDeclarationWrapper
+        return FunctionDeclarationWrapper(result)
+
     # }}}
 
     def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index f0436099c6127e6426b03df2c48342b6ee99c67f..31cf7c6b648ebf370a17d8beb2538b9748ddb30a 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -126,6 +126,7 @@ def _create_vector_types():
             vec.types[np.dtype(base_type), count] = dtype
             vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count
 
+
 _create_vector_types()
 
 
@@ -400,6 +401,10 @@ class OpenCLCASTBuilder(CASTBuilder):
         fdecl = super(OpenCLCASTBuilder, self).get_function_declaration(
                 codegen_state, codegen_result, schedule_index)
 
+        from loopy.target.c import FunctionDeclarationWrapper
+        assert isinstance(fdecl, FunctionDeclarationWrapper)
+        fdecl = fdecl.subdecl
+
         from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
         fdecl = CLKernel(fdecl)
 
@@ -415,7 +420,7 @@ class OpenCLCASTBuilder(CASTBuilder):
 
             fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)
 
-        return fdecl
+        return FunctionDeclarationWrapper(fdecl)
 
     def generate_top_of_body(self, codegen_state):
         from loopy.kernel.data import ImageArg
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 9f1b6ea8e93a62f147aaa67f70b563137766c404..69e0ea325a328d03bbd98cec2163f5c7981a4a78 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1586,9 +1586,8 @@ def test_temp_initializer(ctx_factory, src_order, tmp_order):
 
     assert np.array_equal(a, a2)
 
-def test_header_extract(ctx_factory):
-    ctx = ctx_factory()
 
+def test_header_extract():
     knl = lp.make_kernel('{[k]: 0<=k<n}}',
          """
          for k
@@ -1601,19 +1600,22 @@ def test_header_extract(ctx_factory):
     knl = lp.fix_parameters(knl, n=200)
 
     #test C
-    cknl = knl
-    cknl.target = lp.CTarget()
-    assert lp.generate_header(cknl) == 'void loopy_kernel(float *restrict T);'
+    cknl = knl.copy(target=lp.CTarget())
+    assert str(lp.generate_header(cknl)[0]) == (
+            'void loopy_kernel(float *__restrict__ T);')
 
     #test CUDA
-    cuknl = knl
-    cuknl.target = lp.CudaTarget()
-    assert lp.generate_header(cuknl) == 'extern "C" __global__ void __launch_bounds__(1) loopy_kernel(float *__restrict__ T);'
+    cuknl = knl.copy(target=lp.CudaTarget())
+    assert str(lp.generate_header(cuknl)[0]) == (
+            'extern "C" __global__ void __launch_bounds__(1) '
+            'loopy_kernel(float *__restrict__ T);')
 
     #test OpenCL
-    oclknl = knl
-    oclknl.target = lp.PyOpenCLTarget()
-    assert lp.generate_header(oclknl) == '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *restrict T);'
+    oclknl = knl.copy(target=lp.PyOpenCLTarget())
+    assert str(lp.generate_header(oclknl)[0]) == (
+            '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
+            'loopy_kernel(__global float *__restrict__ T);')
+
 
 def test_scalars_with_base_storage(ctx_factory):
     """ Regression test for !50 """
@@ -1678,6 +1680,7 @@ def test_tight_loop_bounds_codegen():
 
     assert for_loop in cgr.device_code()
 
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])