From 8877a116026a3074a66f08bf42dd49c90679ce50 Mon Sep 17 00:00:00 2001
From: zachjweiner <zachjweiner@gmail.com>
Date: Sat, 8 Aug 2020 12:17:02 -0500
Subject: [PATCH 1/3] PyOpenCLExecutionWrapperGenerator: retain and write to
 numpy inputs

---
 loopy/target/pyopencl_execution.py | 31 ++++++++++++++++++-----------
 test/test_target.py                | 32 ++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 05fdd21f0..3eb266166 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -65,6 +65,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
     def handle_non_numpy_arg(self, gen, arg):
         gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
         with Indentation(gen):
+            gen("# retain originally passed array")
+            gen("_lpy_%s_np_input = %s" % (arg.name, arg.name))
             gen("# synchronous, nothing to worry about")
             gen("%s = _lpy_cl_array.to_device("
                     "queue, %s, allocator=allocator)"
@@ -73,16 +75,20 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
         gen("elif %s is not None:" % arg.name)
         with Indentation(gen):
             gen("_lpy_encountered_dev = True")
+            gen("_lpy_%s_np_input = None" % arg.name)
+        gen("else:")
+        with Indentation(gen):
+            gen("_lpy_%s_np_input = None" % arg.name)
 
         gen("")
 
     # }}}
 
-    # {{{ handle allocation of unspecified arguements
+    # {{{ handle allocation of unspecified arguments
 
     def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks):
         """
-        Handle allocation of non-specified arguements for pyopencl execution
+        Handle allocation of non-specified arguments for pyopencl execution
         """
         from pymbolic import var
 
@@ -200,16 +206,17 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
             with Indentation(gen):
                 gen("out_host = True")
 
-            gen("if out_host:")
-            with Indentation(gen):
-                gen("pass")  # if no outputs (?!)
-                for arg in implemented_data_info:
-                    if not issubclass(arg.arg_class, KernelArgument):
-                        continue
-
-                    is_written = arg.base_name in kernel.get_written_variables()
-                    if is_written:
-                        gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))
+            for arg in implemented_data_info:
+                if not issubclass(arg.arg_class, KernelArgument):
+                    continue
+
+                is_written = arg.base_name in kernel.get_written_variables()
+                if is_written:
+                    np_name = "_lpy_%s_np_input" % arg.name
+                    gen("if out_host or %s is not None:" % np_name)
+                    with Indentation(gen):
+                        gen("%s = %s.get(queue=queue, ary=%s)"
+                            % (arg.name, arg.name, np_name))
 
             gen("")
 
diff --git a/test/test_target.py b/test/test_target.py
index afad1b676..e27f6a32a 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -378,6 +378,38 @@ def test_cuda_short_vector():
     print(lp.generate_code_v2(knl).device_code())
 
 
+def test_pyopencl_execution_numpy_handling(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    # test numpy input for x is written to and returned
+    knl = lp.make_kernel('{:}', ['x[0] = y[0] + x[0]'])
+
+    y = np.array([3.])
+    x = np.array([4.])
+    evt, out = knl(queue, y=y, x=x)
+    assert out[0] is x
+    assert x[0] == 7.
+
+    # test numpy input for x is written to and returned, even when a pyopencl array
+    # is passed for y
+    import pyopencl.array as cla
+    y = cla.zeros(queue, shape=(1), dtype='float64') + 3.
+    x = np.array([4.])
+    evt, out = knl(queue, y=y, x=x)
+    assert out[0] is x
+    assert x[0] == 7.
+
+    # test numpy input for x is written to and returned, even when output-only
+    knl = lp.make_kernel('{:}', ['x[0] = y[0] + 2'])
+
+    y = np.array([3.])
+    x = np.array([4.])
+    evt, out = knl(queue, y=y, x=x)
+    assert out[0] is x
+    assert x[0] == 5.
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From cf63c2620efc2511cedd10dff945611060a4ec66 Mon Sep 17 00:00:00 2001
From: zachjweiner <zachjweiner@gmail.com>
Date: Sat, 8 Aug 2020 12:19:39 -0500
Subject: [PATCH 2/3] make KernelExecutors demand all input arguments are
 passed; indexof tests will fail

---
 loopy/target/execution.py          | 14 +++++++++++-
 loopy/target/pyopencl_execution.py |  2 ++
 test/test_target.py                | 36 ++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index c8f0d4090..afc3341cf 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -385,6 +385,7 @@ class ExecutionWrapperGeneratorBase(object):
 
         for arg_idx, arg in enumerate(implemented_data_info):
             is_written = arg.base_name in kernel.get_written_variables()
+            is_read = arg.base_name in kernel.get_read_variables()
             kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
 
             if not issubclass(arg.arg_class, KernelArgument):
@@ -432,7 +433,8 @@ class ExecutionWrapperGeneratorBase(object):
 
             # {{{ allocate written arrays, if needed
 
-            if is_written and arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \
+            if is_written and not is_read \
+                    and arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \
                     and arg.shape is not None \
                     and all(si is not None for si in arg.shape):
 
@@ -725,10 +727,20 @@ class KernelExecutorBase(object):
         self.output_names = tuple(arg.name for arg in self.kernel.args
                 if arg.name in self.kernel.get_written_variables())
 
+        self.input_names = tuple(arg.name for arg in self.kernel.args
+                if arg.name in self.kernel.get_read_variables())
+
         self.has_runtime_typed_args = any(
                 arg.dtype is None
                 for arg in kernel.args)
 
+    def check_for_required_arguments(self, input_args):
+        missing_args = set(self.input_names) - set(input_args)
+        if missing_args != set():
+            raise LoopyError(
+                "Kernel arguments '%s' are required but were not passed."
+                % missing_args)
+
     def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set):
         from loopy.kernel.tools import add_dtypes
 
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 3eb266166..b57169bac 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -355,6 +355,8 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
 
         kwargs = self.packing_controller.unpack(kwargs)
 
+        self.check_for_required_arguments(kwargs.keys())
+
         kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs))
 
         return kernel_info.invoker(
diff --git a/test/test_target.py b/test/test_target.py
index e27f6a32a..5fc2fe2a3 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -410,6 +410,42 @@ def test_pyopencl_execution_numpy_handling(ctx_factory):
     assert x[0] == 5.
 
 
+def test_input_arguments_are_required(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    import pyopencl.array as cla
+
+    n = 2
+    x = cla.zeros(queue, (n,), 'float64') + 1.5
+    y = cla.zeros(queue, (n,), 'float64') + 2.
+
+    # make sure y is required even when y.is_output == True
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "y[i] = 3*x[i] + y[i]",
+        fixed_parameters=dict(n=n),
+        options=dict(return_dict=True))
+
+    evt, out_dict = knl(queue, x=x, y=y)
+    assert y is out_dict['y']
+    assert (y.get() == 6.5).all()
+
+    from loopy import LoopyError
+    with pytest.raises(LoopyError):
+        evt, out_dict = knl(queue, x=x)
+
+    # make sure y is not required even when y.is_output_only == True
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "y[i] = 3*x[i] + 2",
+        fixed_parameters=dict(n=n),
+        options=dict(return_dict=True))
+
+    evt, out_dict = knl(queue, x=x)
+    assert (out_dict['y'].get() == 6.5).all()
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From 7ee6a3b0669ad0af7d2c5c7e14cb9bf9c1aa26af Mon Sep 17 00:00:00 2001
From: zachjweiner <zachjweiner@gmail.com>
Date: Sun, 9 Aug 2020 09:18:16 -0500
Subject: [PATCH 3/3] Add is_input/is_output; restrict passed arg check to
 ArrayArgs

---
 loopy/kernel/creation.py           |  4 +--
 loopy/kernel/data.py               | 40 +++++++++++++++++++++---
 loopy/kernel/tools.py              | 50 ++++++++++++++++++++++++++++++
 loopy/target/execution.py          | 14 ++++++---
 loopy/target/pyopencl_execution.py |  3 +-
 5 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index bc5c51eb0..816cbfe7b 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -2159,8 +2159,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
     creation_plog.done()
 
-    from loopy.kernel.tools import infer_arg_is_output_only
-    knl = infer_arg_is_output_only(knl)
+    from loopy.kernel.tools import infer_args_are_input_output
+    knl = infer_args_are_input_output(knl)
 
     return knl
 
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index e6544b34a..c528e3eeb 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -338,6 +338,8 @@ class KernelArgument(ImmutableRecord):
             dtype = None
 
         kwargs["dtype"] = dtype
+        kwargs["is_output"] = kwargs.pop("is_output", None)
+        kwargs["is_input"] = kwargs.pop("is_input", None)
 
         ImmutableRecord.__init__(self, **kwargs)
 
@@ -354,16 +356,38 @@ class ArrayArg(ArrayBase, KernelArgument):
 
             An instance of :class:`bool`. If set to *True*, recorded to be
             returned from the kernel.
+
+        .. attribute:: is_output
+            An instance of :class:`bool`. If set to *True*, the argument is used
+            to return information to the caller. If set to *False*, then the
+            callee should not write the array during execution.
+
+        .. attribute:: is_input
+            An instance of :class:`bool`. If set to *True*, expected to be
+            provided by the caller. If *False* then the callee should not depend
+            on the state of the array on entry to a function.
+
         """)
 
     allowed_extra_kwargs = [
             "address_space",
-            "is_output_only"]
+            "is_output_only",
+            "is_output",
+            "is_input"]
 
     def __init__(self, *args, **kwargs):
         if "address_space" not in kwargs:
             raise TypeError("'address_space' must be specified")
-        kwargs["is_output_only"] = kwargs.pop("is_output_only", False)
+
+        is_output_only = kwargs.pop("is_output_only", None)
+        if is_output_only is not None:
+            warn("'is_output_only' is deprecated. Use 'is_output', 'is_input'"
+                    " instead.", DeprecationWarning, stacklevel=2)
+            kwargs["is_output"] = is_output_only
+            kwargs["is_input"] = not is_output_only
+        else:
+            kwargs["is_output"] = kwargs.pop("is_output", None)
+            kwargs["is_input"] = kwargs.pop("is_input", None)
 
         super(ArrayArg, self).__init__(*args, **kwargs)
 
@@ -391,7 +415,8 @@ class ArrayArg(ArrayBase, KernelArgument):
         """
         super(ArrayArg, self).update_persistent_hash(key_hash, key_builder)
         key_builder.rec(key_hash, self.address_space)
-        key_builder.rec(key_hash, self.is_output_only)
+        key_builder.rec(key_hash, self.is_output)
+        key_builder.rec(key_hash, self.is_input)
 
 
 # Making this a function prevents incorrect use in isinstance.
@@ -411,6 +436,9 @@ class ConstantArg(ArrayBase, KernelArgument):
     min_target_axes = 0
     max_target_axes = 1
 
+    is_output = False
+    is_input = True
+
     def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
         return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape,
                 dtype, is_written)
@@ -432,13 +460,15 @@ class ImageArg(ArrayBase, KernelArgument):
 
 class ValueArg(KernelArgument):
     def __init__(self, name, dtype=None, approximately=1000, target=None,
-            is_output_only=False):
+            is_output_only=None, is_output=False, is_input=True):
 
         KernelArgument.__init__(self, name=name,
                 dtype=dtype,
                 approximately=approximately,
                 target=target,
-                is_output_only=is_output_only)
+                is_output_only=is_output_only,
+                is_output=is_output,
+                is_input=is_input)
 
     def __str__(self):
         import loopy as lp
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index e33d260fb..5b00950e7 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1864,6 +1864,56 @@ def infer_arg_is_output_only(kernel):
 
     return kernel.copy(args=new_args)
 
+
+def infer_args_are_input_output(kernel):
+    """
+    Returns a copy of *kernel* with the attributes ``is_input`` and
+    ``is_output`` of the arguments set.
+    .. note::
+        If the :attr:`~loopy.ArrayArg.is_output` is not supplied from a user,
+        then the array is inferred as an output argument if it is written at
+        some point in the kernel.
+        If the :attr:`~loopy.ArrayArg.is_input` is not supplied from a user,
+        then the array is inferred as an input argument if it is either read at
+        some point in the kernel or it is neither read nor written.
+    """
+    from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg
+    new_args = []
+
+    for arg in kernel.args:
+        if isinstance(arg, ArrayArg):
+            if arg.is_output is not None:
+                assert isinstance(arg.is_output, bool)
+            else:
+                if arg.name in kernel.get_written_variables():
+                    arg = arg.copy(is_output=True)
+                else:
+                    arg = arg.copy(is_output=False)
+
+            if arg.is_input is not None:
+                assert isinstance(arg.is_input, bool)
+            else:
+                if arg.name in kernel.get_read_variables() or (
+                        (arg.name not in kernel.get_read_variables()) and (
+                            arg.name not in kernel.get_written_variables())):
+                    arg = arg.copy(is_input=True)
+                else:
+                    arg = arg.copy(is_input=False)
+        elif isinstance(arg, (ConstantArg, ImageArg, ValueArg)):
+            pass
+        else:
+            raise NotImplementedError("Unkonwn argument type %s." % type(arg))
+
+        if not (arg.is_input or arg.is_output):
+            raise LoopyError("Kernel argument must be either input or output."
+                    " '{}' in '{}' does not follow it.".format(arg.name,
+                        kernel.name))
+
+        new_args.append(arg)
+
+    return kernel.copy(args=new_args)
+
+
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index afc3341cf..84cb616ab 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -725,21 +725,25 @@ class KernelExecutorBase(object):
         self.packing_controller = SeparateArrayPackingController(kernel)
 
         self.output_names = tuple(arg.name for arg in self.kernel.args
-                if arg.name in self.kernel.get_written_variables())
+                if arg.is_output)
 
         self.input_names = tuple(arg.name for arg in self.kernel.args
-                if arg.name in self.kernel.get_read_variables())
+                if arg.is_input)
+
+        from loopy import ArrayArg
+        self.input_array_names = tuple(arg.name for arg in self.kernel.args
+                if arg.is_input and isinstance(arg, ArrayArg))
 
         self.has_runtime_typed_args = any(
                 arg.dtype is None
                 for arg in kernel.args)
 
     def check_for_required_arguments(self, input_args):
-        missing_args = set(self.input_names) - set(input_args)
+        missing_args = set(self.input_array_names) - set(input_args)
         if missing_args != set():
             raise LoopyError(
-                "Kernel arguments '%s' are required but were not passed."
-                % missing_args)
+                "Kernel %s() missing required array arguments: '%s'."
+                % (self.kernel.name, ', '.join(missing_args)))
 
     def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set):
         from loopy.kernel.tools import add_dtypes
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index b57169bac..f443518a5 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -353,9 +353,8 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
         wait_for = kwargs.pop("wait_for", None)
         out_host = kwargs.pop("out_host", None)
 
-        kwargs = self.packing_controller.unpack(kwargs)
-
         self.check_for_required_arguments(kwargs.keys())
+        kwargs = self.packing_controller.unpack(kwargs)
 
         kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs))
 
-- 
GitLab