diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 9b1e6b9f3cdce8011d1aaacaf3806b7c37d6cedb..4cdad2ad45ce2ba57ba866dabaf69690ba43c273 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -306,7 +306,8 @@ argument: ... """ ... out[j,i] = a[i,j] ... out[i,j] = 2*out[i,j] - ... """) + ... """, + ... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...]) loopy's programming model is completely *unordered* by default. This means that: @@ -333,7 +334,9 @@ an explicit dependency: ... """ ... out[j,i] = a[i,j] {id=transpose} ... out[i,j] = 2*out[i,j] {dep=transpose} - ... """, name="transpose_and_dbl") + ... """, + ... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...], + ... name="transpose_and_dbl") ``{id=transpose}`` assigns the identifier *transpose* to the first instruction, and ``{dep=transpose}`` declares a dependency of the second @@ -394,7 +397,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float *__restrict__ out, __global float const *__restrict__ a, int const n) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -430,7 +433,8 @@ with identical bounds, for the use of the transpose: ... """ ... out[j,i] = a[i,j] {id=transpose} ... out[ii,jj] = 2*out[ii,jj] {dep=transpose} - ... """) + ... """, + ... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...]) >>> knl = lp.prioritize_loops(knl, "i,j,ii,jj") :func:`loopy.duplicate_inames` can be used to achieve the same goal. @@ -443,7 +447,7 @@ Now the intended code is generated and our test passes. 
#define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ out, __global float const *__restrict__ a, int const n) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index a270fcdb89fb362ac3776adfd3fac4ab0b8cffec..81f1e0c11d2e089deac005174393471a8208bb34 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -497,6 +497,9 @@ class CKernelExecutor(KernelExecutorBase): """ assert entrypoint is not None + if __debug__: + self.check_for_required_array_arguments(kwargs.keys()) + if self.packing_controller is not None: kwargs = self.packing_controller(kwargs) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index b7a223968cc726b2cb85a595e11b3ce482b02d1b..1a5744230fc6f73f51b26e918d37255af184326e 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -734,12 +734,32 @@ class KernelExecutorBase: self.packing_controller = make_packing_controller(program, entrypoint) - self.output_names = tuple(arg.name for arg in self.program[entrypoint].args - if arg.is_output) + kernel = self.program[entrypoint] + self.output_names = set(arg.name for arg in kernel.args if arg.is_output) + + from loopy import ArrayArg + self.input_array_names = set( + arg.name for arg in kernel.args + if arg.is_input and isinstance(arg, ArrayArg)) self.has_runtime_typed_args = any( - arg.dtype is None - for arg in program[entrypoint].args) + arg.dtype is None for arg in kernel.args) + + def check_for_required_array_arguments(self, input_args): + # Formerly, the first exception raised when a required argument is not + # passed was often at type inference. 
This exists to raise a more meaningful + # message in such scenarios. Since type inference precedes compilation, this + # check cannot be deferred to the generated invoker code. + # See discussion at github.com/inducer/loopy/pull/160#issuecomment-867761204 + # and links therein for context. + if not self.input_array_names <= set(input_args): + missing_args = self.input_array_names - set(input_args) + kernel = self.program[self.entrypoint] + raise LoopyError( + f"Kernel {kernel.name}() missing required array input arguments: " + f"{', '.join(missing_args)}. " + "If this is a surprise, maybe you need to add is_input=False to " + "you argument.") def get_typed_and_scheduled_translation_unit_uncached( self, entrypoint, arg_to_dtype_set): diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 1366ef8594ede156ebe045926fa6b43a0f613a09..1b90add01b10ea2cb5abab8a7e0e5f76cb063aa7 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -355,6 +355,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): assert entrypoint is not None + if __debug__: + self.check_for_required_array_arguments(kwargs.keys()) + if self.packing_controller is not None: kwargs = self.packing_controller(kwargs) diff --git a/test/test_expression.py b/test/test_expression.py index 1aca17a9d1e183591724bfed2cdd2c1f18b7ba5d..52df3e3054c442209a30c21784a824e663ce5214 100644 --- a/test/test_expression.py +++ b/test/test_expression.py @@ -399,7 +399,9 @@ def test_indexof(ctx_factory): knl = lp.make_kernel( """ { [i,j]: 0<=i,j<5 } """, - """ out[i,j] = indexof(out[i,j])""") + """ out[i,j] = indexof(out[i,j])""", + [lp.GlobalArg("out", is_input=False, shape=lp.auto)] + ) knl = lp.set_options(knl, write_cl=True) @@ -508,6 +510,11 @@ def test_complex_support(ctx_factory, target): out_sum = sum(i1, 1.0*i1 + i1*1jf)*sum(i2, 1.0*i2 + i2*1jf) conj_out_sum = conj(out_sum) """, + [ + lp.GlobalArg("out_sum, euler1, real_plus_complex", + is_input=False, 
shape=lp.auto), + ... + ], target=target(), seq_dependencies=True) knl = lp.set_options(knl, "return_dict") diff --git a/test/test_loopy.py b/test/test_loopy.py index 54b94d8fcf86401279b5e8e844ea23e0560a656a..739dbae716bdad59374217160f365bd7bdba7556 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -62,10 +62,12 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory): out[i] = a[i]+cnst[i]{id=first} out[ii] = 2*out[ii]+cnst[ii]{id=second} """, - [lp.TemporaryVariable( - "cnst", initializer=cnst, - scope=lp.AddressSpace.GLOBAL, - read_only=True), "..."]) + [ + lp.TemporaryVariable( + "cnst", initializer=cnst, scope=lp.AddressSpace.GLOBAL, + read_only=True), + lp.GlobalArg("out", is_input=False, shape=lp.auto), + "..."]) knl = lp.fix_parameters(knl, n=16) knl = lp.add_barrier(knl, "id:first", "id:second") @@ -1472,7 +1474,11 @@ def test_finite_difference_expr_subst(ctx_factory): fin_diff_knl = lp.make_kernel( "{[i]: 1<=i<=n}", "out[i] = -(f[i+1] - f[i-1])/h", - [lp.GlobalArg("out", shape="n+2"), "..."]) + [ + lp.GlobalArg("out", shape="n+2"), + lp.GlobalArg("f", is_input=False, is_output=True, shape=lp.auto), + "..." + ]) flux_knl = lp.make_kernel( "{[j]: 1<=j<=n}", diff --git a/test/test_target.py b/test/test_target.py index 585b0055a014cbe4a8ac632c38c43c8763495951..4c2eef620d47d870326e7d5d4756b58bbb182f1b 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -20,6 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +from loopy.diagnostic import LoopyError import sys import numpy as np import loopy as lp @@ -424,6 +425,7 @@ def test_nan_support(ctx_factory): [lp.Assignment(parse("a"), np.nan), lp.Assignment(parse("b"), parse("isnan(a)")), lp.Assignment(parse("c"), parse("isnan(3.14)"))], + [lp.GlobalArg("a", is_input=False, shape=tuple()), ...], seq_dependencies=True) knl = lp.set_options(knl, "return_dict") @@ -521,6 +523,49 @@ def test_inf_support(ctx_factory, target, dtype): assert np.isneginf(out_dict["out_neginf"]) +def test_input_args_are_required(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl1 = lp.make_kernel( + "{ [i]: 0<=i<2 }", + """ + g[i] = f[i] + 1.5 + """, + [lp.GlobalArg("f, g", dtype="float64"), ...] + ) + + knl2 = lp.make_kernel( + "{ [i]: 0<=i