diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 1d304fd2447425785b99b17adaccf14e832bc3a7..ababfbb35904f1ad8d1ff1e71f2d041ded373800 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -583,7 +583,16 @@ class Program:
     def compile(self, options=[], devices=None, headers=[]):
+        """Compile this program, optionally against in-memory headers.
+
+        *headers* is an iterable of ``(name, program)`` pairs; each
+        *program* is unwrapped with ``_get_prg()`` before being passed on.
+        Returns *self*.
+        """
         options_bytes, _ = self._process_build_options(self._context, options)
 
-        self._get_prg().compile(options_bytes, devices, headers)
+        native_prg = self._get_prg()
+        native_headers = [
+                (hdr_name, hdr_prg._get_prg()) for hdr_name, hdr_prg in headers]
+        native_prg.compile(options_bytes, devices, native_headers)
         return self
 
     def __eq__(self, other):
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index 1068779b78d443cd0f3fe012c429025d82057d6d..debcb2b405c62f6828b8d5fa4efb35dbb3ddb865 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -1146,14 +1146,22 @@ def test_compile_link(ctx_factory):
         {
         }
         """).compile()
+    pi_h__prg = cl.Program(ctx, """//CL//
+        inline float get_pi()
+        {
+            return 3.1415f;
+        }
+        """).compile()
     main_prg = cl.Program(ctx, """//CL//
+        #include "pi.h"
+
         void value_sink(float x);
 
         __kernel void experiment()
         {
-            value_sink(3.1415f + get_global_id(0));
+            value_sink(get_pi() + get_global_id(0));
         }
-        """).compile()
+        """).compile(headers=[("pi.h", pi_h__prg)])
     z = cl.link_program(ctx, [vsink_prg, main_prg], devices=ctx.devices)
     z.experiment(queue, (128**2,), (128,))
     queue.finish()