diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 9b1e6b9f3cdce8011d1aaacaf3806b7c37d6cedb..4cdad2ad45ce2ba57ba866dabaf69690ba43c273 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -306,7 +306,8 @@ argument:
     ...     """
     ...     out[j,i] = a[i,j]
     ...     out[i,j] = 2*out[i,j]
-    ...     """)
+    ...     """,
+    ...     [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
 
 loopy's programming model is completely *unordered* by default. This means
 that:
@@ -333,7 +334,9 @@ an explicit dependency:
     ...     """
     ...     out[j,i] = a[i,j] {id=transpose}
     ...     out[i,j] = 2*out[i,j]  {dep=transpose}
-    ...     """, name="transpose_and_dbl")
+    ...     """,
+    ...     [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...],
+    ...     name="transpose_and_dbl")
 
 ``{id=transpose}`` assigns the identifier *transpose* to the first
 instruction, and ``{dep=transpose}`` declares a dependency of the second
@@ -394,7 +397,7 @@ Let us take a look at the generated code for the above kernel:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float *__restrict__ out, __global float const *__restrict__ a, int const n)
     {
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
@@ -430,7 +433,8 @@ with identical bounds, for the use of the transpose:
     ...     """
     ...     out[j,i] = a[i,j] {id=transpose}
     ...     out[ii,jj] = 2*out[ii,jj]  {dep=transpose}
-    ...     """)
+    ...     """,
+    ...     [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
     >>> knl = lp.prioritize_loops(knl, "i,j,ii,jj")
 
 :func:`loopy.duplicate_inames` can be used to achieve the same goal.
@@ -443,7 +447,7 @@ Now the intended code is generated and our test passes.
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ out, __global float const *__restrict__ a, int const n)
     {
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index a270fcdb89fb362ac3776adfd3fac4ab0b8cffec..81f1e0c11d2e089deac005174393471a8208bb34 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -497,6 +497,9 @@ class CKernelExecutor(KernelExecutorBase):
         """
         assert entrypoint is not None
 
+        if __debug__:
+            self.check_for_required_array_arguments(kwargs.keys())
+
         if self.packing_controller is not None:
             kwargs = self.packing_controller(kwargs)
 
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index b7a223968cc726b2cb85a595e11b3ce482b02d1b..1a5744230fc6f73f51b26e918d37255af184326e 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -734,12 +734,32 @@ class KernelExecutorBase:
 
         self.packing_controller = make_packing_controller(program, entrypoint)
 
-        self.output_names = tuple(arg.name for arg in self.program[entrypoint].args
-                if arg.is_output)
+        kernel = self.program[entrypoint]
+        self.output_names = set(arg.name for arg in kernel.args if arg.is_output)
+
+        from loopy import ArrayArg
+        self.input_array_names = set(
+            arg.name for arg in kernel.args
+            if arg.is_input and isinstance(arg, ArrayArg))
 
         self.has_runtime_typed_args = any(
-                arg.dtype is None
-                for arg in program[entrypoint].args)
+            arg.dtype is None for arg in kernel.args)
+
+    def check_for_required_array_arguments(self, input_args):
+        """Raise :class:`LoopyError` naming required input arrays not passed.
+
+        *input_args* is an iterable of the argument names supplied by the caller.
+        Formerly the first, less meaningful failure was often at type inference;
+        that precedes compilation, so this cannot move into the generated invoker.
+        See github.com/inducer/loopy/pull/160#issuecomment-867761204 for context.
+        """
+        missing_args = self.input_array_names - set(input_args)
+        if missing_args:
+            kernel = self.program[self.entrypoint]
+            raise LoopyError(
+                f"Kernel {kernel.name}() missing required array input arguments: "
+                f"{', '.join(sorted(missing_args))}. If this is a surprise, maybe "
+                "you need to add is_input=False to your argument.")
 
     def get_typed_and_scheduled_translation_unit_uncached(
             self, entrypoint, arg_to_dtype_set):
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 1366ef8594ede156ebe045926fa6b43a0f613a09..1b90add01b10ea2cb5abab8a7e0e5f76cb063aa7 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -355,6 +355,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
 
         assert entrypoint is not None
 
+        if __debug__:
+            self.check_for_required_array_arguments(kwargs.keys())
+
         if self.packing_controller is not None:
             kwargs = self.packing_controller(kwargs)
 
diff --git a/test/test_expression.py b/test/test_expression.py
index 1aca17a9d1e183591724bfed2cdd2c1f18b7ba5d..52df3e3054c442209a30c21784a824e663ce5214 100644
--- a/test/test_expression.py
+++ b/test/test_expression.py
@@ -399,7 +399,9 @@ def test_indexof(ctx_factory):
 
     knl = lp.make_kernel(
          """ { [i,j]: 0<=i,j<5 } """,
-         """ out[i,j] = indexof(out[i,j])""")
+         """ out[i,j] = indexof(out[i,j])""",
+         [lp.GlobalArg("out", is_input=False, shape=lp.auto)]
+    )
 
     knl = lp.set_options(knl, write_cl=True)
 
@@ -508,6 +510,11 @@ def test_complex_support(ctx_factory, target):
             out_sum = sum(i1, 1.0*i1 + i1*1jf)*sum(i2, 1.0*i2 + i2*1jf)
             conj_out_sum = conj(out_sum)
             """,
+            [
+                lp.GlobalArg("out_sum, euler1, real_plus_complex",
+                            is_input=False, shape=lp.auto),
+                ...
+            ],
             target=target(),
             seq_dependencies=True)
     knl = lp.set_options(knl, "return_dict")
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 54b94d8fcf86401279b5e8e844ea23e0560a656a..739dbae716bdad59374217160f365bd7bdba7556 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -62,10 +62,12 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory):
             out[i] = a[i]+cnst[i]{id=first}
             out[ii] = 2*out[ii]+cnst[ii]{id=second}
             """,
-            [lp.TemporaryVariable(
-                "cnst", initializer=cnst,
-                scope=lp.AddressSpace.GLOBAL,
-                read_only=True), "..."])
+            [
+                lp.TemporaryVariable(
+                    "cnst", initializer=cnst, scope=lp.AddressSpace.GLOBAL,
+                    read_only=True),
+                lp.GlobalArg("out", is_input=False, shape=lp.auto),
+                "..."])
     knl = lp.fix_parameters(knl, n=16)
     knl = lp.add_barrier(knl, "id:first", "id:second")
 
@@ -1472,7 +1474,11 @@ def test_finite_difference_expr_subst(ctx_factory):
     fin_diff_knl = lp.make_kernel(
         "{[i]: 1<=i<=n}",
         "out[i] = -(f[i+1] - f[i-1])/h",
-        [lp.GlobalArg("out", shape="n+2"), "..."])
+        [
+            lp.GlobalArg("out", shape="n+2"),
+            lp.GlobalArg("f", is_input=False, is_output=True, shape=lp.auto),
+            "..."
+        ])
 
     flux_knl = lp.make_kernel(
         "{[j]: 1<=j<=n}",
diff --git a/test/test_target.py b/test/test_target.py
index 585b0055a014cbe4a8ac632c38c43c8763495951..4c2eef620d47d870326e7d5d4756b58bbb182f1b 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -20,6 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from loopy.diagnostic import LoopyError
 import sys
 import numpy as np
 import loopy as lp
@@ -424,6 +425,7 @@ def test_nan_support(ctx_factory):
         [lp.Assignment(parse("a"), np.nan),
          lp.Assignment(parse("b"), parse("isnan(a)")),
          lp.Assignment(parse("c"), parse("isnan(3.14)"))],
+        [lp.GlobalArg("a", is_input=False, shape=tuple()), ...],
         seq_dependencies=True)
 
     knl = lp.set_options(knl, "return_dict")
@@ -521,6 +523,49 @@ def test_inf_support(ctx_factory, target, dtype):
     assert np.isneginf(out_dict["out_neginf"])
 
 
+def test_input_args_are_required(ctx_factory):
+    """Check that omitting a required input array raises LoopyError."""
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl1 = lp.make_kernel(
+        "{ [i]: 0<=i<2 }",
+        """
+        g[i] = f[i] + 1.5
+        """,
+        [lp.GlobalArg("f, g", dtype="float64"), ...]
+    )
+
+    knl2 = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "g[i] = 3 * f[i] + g[i]")
+
+    f = np.zeros(2)
+    g = np.zeros(2)
+
+    for knl in [knl1, knl2]:
+        for kwargs in [{}, {"g": g}]:  # separate raises() per call so both run
+            with pytest.raises(LoopyError):
+                _ = knl(queue, **kwargs)
+
+    _ = knl1(queue, f=f)
+    _ = knl1(queue, f=f, g=g)
+
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<2 }",
+        """
+        f[i] = 3.
+        g[i] = f[i] + 1.5
+        """,
+        [lp.GlobalArg("f, g", dtype="float64"), ...]
+    )
+
+    # FIXME: this should not raise!
+    # https://github.com/inducer/loopy/issues/450
+    with pytest.raises(LoopyError):
+        _ = knl(queue)
+
+
 def test_pyopencl_execution_accepts_device_scalars(ctx_factory):
     import pyopencl.array as cla
 
diff --git a/test/test_transform.py b/test/test_transform.py
index 79e588cd67a911579db8bc4af6f5d6ca94045342..51e7c263667c69235d01bce705a51f98e8d14aa3 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -176,7 +176,12 @@ def test_add_barrier(ctx_factory):
             """
             out[j, i] = a[i, j]{id=transpose}
             out[ii, jj] = 2*out[ii, jj]{id=double}
-            """)
+            """,
+            [
+                lp.GlobalArg("out", is_input=False, shape=lp.auto),
+                ...
+            ]
+    )
     a = np.random.randn(16, 16)
     knl = lp.add_barrier(knl, "id:transpose", "id:double", "gb1")