From e3bb8f37ddffec0e9c998c2950edb330a54df478 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 8 Jul 2017 16:45:31 -0500
Subject: [PATCH 01/18] Add a make_tuple() function to loopy.

This function does trivial things, but it's there to solve the problem
that the reduction neutral element getters are not allowed to store
dtypes (#80).

The function mangler demands that a function knows its type based on
its arguments. For the neutral element getters, this is impossible
because they take zero arguments. The simplest fix I can think of is
to change a call to neutral_element() to a call to make_tuple().

Currently, the tuple code doesn't work yet due to pickling issues.  I
think the root cause is somewhere in
_hackily_ensure_multi_assignment_return_values_are_scoped_private().
---
 loopy/library/function.py |  3 ++-
 loopy/type_inference.py   |  2 +-
 test/test_target.py       | 13 +++++++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/loopy/library/function.py b/loopy/library/function.py
index efa590371..f3d14516c 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -25,8 +25,9 @@ THE SOFTWARE.
 
 def default_function_mangler(kernel, name, arg_dtypes):
     from loopy.library.reduction import reduction_function_mangler
+    from loopy.library.tuple import tuple_function_mangler
 
-    manglers = [reduction_function_mangler]
+    manglers = [reduction_function_mangler, tuple_function_mangler]
     for mangler in manglers:
         result = mangler(kernel, name, arg_dtypes)
         if result is not None:
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 78d817ce7..3fb165ead 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -442,7 +442,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
                         result_i = comp_dtype_set
                         break
 
-                assert found
+                assert found, var_name
                 if result_i is not None:
                     result.append(result_i)
 
diff --git a/test/test_target.py b/test/test_target.py
index b656383e7..4b09829e1 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -176,6 +176,19 @@ def test_random123(ctx_factory, tp):
     assert (0 <= out).all()
 
 
+def test_tuple():
+    knl = lp.make_kernel(
+            "{ [i]: 0 <= i < 10 }",
+            """
+            a, b = make_tuple(1, 2)
+            """)
+
+    print(
+            lp.generate_code(
+                lp.get_one_scheduled_kernel(
+                    lp.preprocess_kernel(knl)))[0])
+
+
 def test_clamp(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
-- 
GitLab


From b25c5bc238729de8ee4fb0ac258d5f28590d3e15 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 8 Jul 2017 16:54:50 -0500
Subject: [PATCH 02/18] [ci skip] Add missing file.

---
 loopy/library/tuple.py | 69 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 loopy/library/tuple.py

diff --git a/loopy/library/tuple.py b/loopy/library/tuple.py
new file mode 100644
index 000000000..e60d24d70
--- /dev/null
+++ b/loopy/library/tuple.py
@@ -0,0 +1,69 @@
+from __future__ import absolute_import, division, print_function
+
+__copyright__ = "Copyright (C) 2017 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from loopy.diagnostic import LoopyError
+
+
+def tuple_function_mangler(kernel, name, arg_dtypes):
+    if name == "make_tuple":
+        from loopy.kernel.data import CallMangleInfo
+        return CallMangleInfo(
+                target_name=tuple_function_name(*arg_dtypes),
+                result_dtypes=arg_dtypes,
+                arg_dtypes=arg_dtypes)
+
+    return None
+
+
+def tuple_function_name(dtype0, dtype1):
+    return "loopy_tuple_%s_%s" % (
+            dtype0.numpy_dtype.type.__name__, dtype1.numpy_dtype.type.__name__)
+
+
+def get_tuple_preamble(kernel, func_id, arg_dtypes):
+    print("arg dtypes are", arg_dtypes)
+    name = tuple_function_name(*arg_dtypes)
+    return (name, """
+    inline %(t0)s %(name)s(%(t0)s i0, %(t1)s i1, %(t1)s *o1)
+    {
+      *o1 = i1;
+      return i0;
+    }
+    """ % dict(name=name,
+            t0=kernel.target.dtype_to_typename(arg_dtypes[0]),
+            t1=kernel.target.dtype_to_typename(arg_dtypes[1])))
+
+
+def tuple_preamble_generator(preamble_info):
+    from loopy.target.opencl import OpenCLTarget
+
+    for func in preamble_info.seen_functions:
+        if func.name == "make_tuple":
+            if not isinstance(preamble_info.kernel.target, OpenCLTarget):
+                raise LoopyError("only OpenCL supported for now")
+
+            yield get_tuple_preamble(preamble_info.kernel, func.name,
+                    func.arg_dtypes)
+
+# vim: fdm=marker
-- 
GitLab


From 4dc48bde8aef3d5d95637c0ee954a7f87c68aec4 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 8 Jul 2017 16:56:19 -0500
Subject: [PATCH 03/18] Remove print statement.

---
 loopy/library/tuple.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/loopy/library/tuple.py b/loopy/library/tuple.py
index e60d24d70..dd6b553eb 100644
--- a/loopy/library/tuple.py
+++ b/loopy/library/tuple.py
@@ -42,7 +42,6 @@ def tuple_function_name(dtype0, dtype1):
 
 
 def get_tuple_preamble(kernel, func_id, arg_dtypes):
-    print("arg dtypes are", arg_dtypes)
     name = tuple_function_name(*arg_dtypes)
     return (name, """
     inline %(t0)s %(name)s(%(t0)s i0, %(t1)s i1, %(t1)s *o1)
-- 
GitLab


From 5201ec1f5a6c326e77d5346dbd0fc006a8cab7ae Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 9 Jul 2017 15:43:35 -0500
Subject: [PATCH 04/18] Make the tuple generation work.

---
 loopy/preprocess.py    |  8 +++++++-
 loopy/target/opencl.py |  5 ++++-
 test/test_target.py    | 17 ++++++++++-------
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index c331ccc82..30968630f 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -331,6 +331,9 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
 
     # }}}
 
+    from loopy.type_inference import TypeInferenceMapper
+    type_inf_mapper = TypeInferenceMapper(kernel)
+
     from loopy.kernel.instruction import CallInstruction
     for insn in kernel.instructions:
         if not isinstance(insn, CallInstruction):
@@ -352,6 +355,9 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
 
         FIRST_POINTER_ASSIGNEE_IDX = 1  # noqa
 
+        assignee_dtypes, = type_inf_mapper(
+                insn.expression, return_tuple=True, return_dtype_set=True)
+
         for assignee_nr, assignee_var_name, assignee in zip(
                 range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)),
                 assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:],
@@ -383,7 +389,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
             new_temporaries[new_assignee_name] = (
                     TemporaryVariable(
                         name=new_assignee_name,
-                        dtype=lp.auto,
+                        dtype=assignee_dtypes[assignee_nr],
                         scope=temp_var_scope.PRIVATE))
 
             from pymbolic import var
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 01e56405e..e70acfeab 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -390,10 +390,13 @@ class OpenCLCASTBuilder(CASTBuilder):
 
     def preamble_generators(self):
         from loopy.library.reduction import reduction_preamble_generator
+        from loopy.library.tuple import tuple_preamble_generator
+
         return (
                 super(OpenCLCASTBuilder, self).preamble_generators() + [
                     opencl_preamble_generator,
-                    reduction_preamble_generator
+                    reduction_preamble_generator,
+                    tuple_preamble_generator
                     ])
 
     # }}}
diff --git a/test/test_target.py b/test/test_target.py
index 4b09829e1..2c6119552 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -176,17 +176,20 @@ def test_random123(ctx_factory, tp):
     assert (0 <= out).all()
 
 
-def test_tuple():
+def test_tuple(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
     knl = lp.make_kernel(
-            "{ [i]: 0 <= i < 10 }",
+            "{ [i]: 0 = i }",
             """
-            a, b = make_tuple(1, 2)
+            a, b = make_tuple(1, 2.)
             """)
 
-    print(
-            lp.generate_code(
-                lp.get_one_scheduled_kernel(
-                    lp.preprocess_kernel(knl)))[0])
+    evt, (a,b) = knl(queue)
+
+    assert a.get() == 1
+    assert b.get() == 2.
 
 
 def test_clamp(ctx_factory):
-- 
GitLab


From c4891c7157be269d83b5963cb5bcceee0b0e3866 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 9 Jul 2017 19:43:27 -0500
Subject: [PATCH 05/18] flake8 fixes

---
 loopy/preprocess.py | 1 -
 test/test_target.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 30968630f..38499cb91 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -385,7 +385,6 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
 
             newly_added_assignments_ids.add(new_assignment_id)
 
-            import loopy as lp
             new_temporaries[new_assignee_name] = (
                     TemporaryVariable(
                         name=new_assignee_name,
diff --git a/test/test_target.py b/test/test_target.py
index 2c6119552..ad0cb7439 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -186,7 +186,7 @@ def test_tuple(ctx_factory):
             a, b = make_tuple(1, 2.)
             """)
 
-    evt, (a,b) = knl(queue)
+    evt, (a, b) = knl(queue)
 
     assert a.get() == 1
     assert b.get() == 2.
-- 
GitLab


From c6898ffa48da9ef24acdc65570e44c9aa95de707 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 9 Jul 2017 20:24:42 -0500
Subject: [PATCH 06/18] Fix argmin and segmented reductions.

---
 loopy/library/reduction.py | 213 +++++++++++++------------------------
 test/test_loopy.py         |  41 +++++++
 2 files changed, 115 insertions(+), 139 deletions(-)

diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index f9648bde7..962b31681 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -123,7 +123,7 @@ class ScalarReductionOperation(ReductionOperation):
 
 class SumReductionOperation(ScalarReductionOperation):
     def neutral_element(self, dtype):
-        return 0
+        return dtype.numpy_dtype.type(0)
 
     def __call__(self, dtype, operand1, operand2):
         return operand1 + operand2
@@ -131,7 +131,7 @@ class SumReductionOperation(ScalarReductionOperation):
 
 class ProductReductionOperation(ScalarReductionOperation):
     def neutral_element(self, dtype):
-        return 1
+        return dtype.numpy_dtype.type(1)
 
     def __call__(self, dtype, operand1, operand2):
         return operand1 * operand2
@@ -189,8 +189,26 @@ class MinReductionOperation(ScalarReductionOperation):
         return var("min")(operand1, operand2)
 
 
+# {{{ base class for symbolic reduction ops
+
+class ReductionOpFunction(FunctionIdentifier):
+    init_arg_names = ("reduction_op",)
+
+    def __init__(self, reduction_op):
+        self.reduction_op = reduction_op
+
+    def __getinitargs__(self):
+        return (self.reduction_op,)
+
+# }}}
+
+
 # {{{ segmented reduction
 
+class SegmentedOp(ReductionOpFunction):
+    pass
+
+
 class _SegmentedScalarReductionOperation(ReductionOperation):
     def __init__(self, **kwargs):
         self.inner_reduction = self.base_reduction_class(**kwargs)
@@ -205,7 +223,9 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
                 segment_flag_dtype.numpy_dtype.type.__name__)
 
     def neutral_element(self, scalar_dtype, segment_flag_dtype):
-        return SegmentedFunction(self, (scalar_dtype, segment_flag_dtype), "init")()
+        scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype)
+        return var("make_tuple")(scalar_neutral_element,
+                segment_flag_dtype.numpy_dtype.type(0))
 
     def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype):
         return (self.inner_reduction.result_dtypes(kernel, scalar_dtype)
@@ -221,7 +241,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
         return type(self) == type(other)
 
     def __call__(self, dtypes, operand1, operand2):
-        return SegmentedFunction(self, dtypes, "update")(*(operand1 + operand2))
+        return SegmentedOp(self)(*(operand1 + operand2))
 
 
 class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
@@ -236,45 +256,13 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
     which = "product"
 
 
-class SegmentedFunction(FunctionIdentifier):
-    init_arg_names = ("reduction_op", "dtypes", "name")
-
-    def __init__(self, reduction_op, dtypes, name):
-        """
-        :arg dtypes: A :class:`tuple` of `(scalar_dtype, segment_flag_dtype)`
-        """
-        self.reduction_op = reduction_op
-        self.dtypes = dtypes
-        self.name = name
-
-    @property
-    def scalar_dtype(self):
-        return self.dtypes[0]
-
-    @property
-    def segment_flag_dtype(self):
-        return self.dtypes[1]
-
-    def __getinitargs__(self):
-        return (self.reduction_op, self.dtypes, self.name)
-
-
-def get_segmented_function_preamble(kernel, func_id):
+def get_segmented_function_preamble(kernel, func_id, arg_dtypes):
     op = func_id.reduction_op
-    prefix = op.prefix(func_id.scalar_dtype, func_id.segment_flag_dtype)
-
-    from pymbolic.mapper.c_code import CCodeMapper
-
-    c_code_mapper = CCodeMapper()
+    scalar_dtype, segment_flag_dtype = arg_dtypes
+    prefix = op.prefix(scalar_dtype, segment_flag_dtype)
 
     return (prefix, """
-    inline %(scalar_t)s %(prefix)s_init(%(segment_flag_t)s *segment_flag_out)
-    {
-        *segment_flag_out = 0;
-        return %(neutral)s;
-    }
-
-    inline %(scalar_t)s %(prefix)s_update(
+    inline %(scalar_t)s %(prefix)s_op(
         %(scalar_t)s op1, %(segment_flag_t)s segment_flag1,
         %(scalar_t)s op2, %(segment_flag_t)s segment_flag2,
         %(segment_flag_t)s *segment_flag_out)
@@ -283,32 +271,36 @@ def get_segmented_function_preamble(kernel, func_id):
         return segment_flag2 ? op2 : %(combined)s;
     }
     """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype),
+            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
             prefix=prefix,
-            segment_flag_t=kernel.target.dtype_to_typename(
-                    func_id.segment_flag_dtype),
-            neutral=c_code_mapper(
-                    op.inner_reduction.neutral_element(func_id.scalar_dtype)),
+            segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype),
             combined=op.op % ("op1", "op2"),
             ))
 
-
 # }}}
 
 
 # {{{ argmin/argmax
 
+class ArgExtOp(ReductionOpFunction):
+    pass
+
+
 class _ArgExtremumReductionOperation(ReductionOperation):
     def prefix(self, scalar_dtype, index_dtype):
         return "loopy_arg%s_%s_%s" % (self.which,
-                index_dtype.numpy_dtype.type.__name__,
-                scalar_dtype.numpy_dtype.type.__name__)
+                scalar_dtype.numpy_dtype.type.__name__,
+                index_dtype.numpy_dtype.type.__name__)
 
     def result_dtypes(self, kernel, scalar_dtype, index_dtype):
         return (scalar_dtype, index_dtype)
 
     def neutral_element(self, scalar_dtype, index_dtype):
-        return ArgExtFunction(self, (scalar_dtype, index_dtype), "init")()
+        scalar_neutral_func = (
+                get_ge_neutral if self.neutral_sign < 0 else get_le_neutral)
+        scalar_neutral_element = scalar_neutral_func(scalar_dtype)
+        return var("make_tuple")(scalar_neutral_element,
+                index_dtype.numpy_dtype.type(-1))
 
     def __str__(self):
         return self.which
@@ -324,7 +316,7 @@ class _ArgExtremumReductionOperation(ReductionOperation):
         return 2
 
     def __call__(self, dtypes, operand1, operand2):
-        return ArgExtFunction(self, dtypes, "update")(*(operand1 + operand2))
+        return ArgExtOp(self)(*(operand1 + operand2))
 
 
 class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
@@ -339,44 +331,15 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation):
     neutral_sign = +1
 
 
-class ArgExtFunction(FunctionIdentifier):
-    init_arg_names = ("reduction_op", "dtypes", "name")
-
-    def __init__(self, reduction_op, dtypes, name):
-        self.reduction_op = reduction_op
-        self.dtypes = dtypes
-        self.name = name
-
-    @property
-    def scalar_dtype(self):
-        return self.dtypes[0]
-
-    @property
-    def index_dtype(self):
-        return self.dtypes[1]
-
-    def __getinitargs__(self):
-        return (self.reduction_op, self.dtypes, self.name)
-
-
-def get_argext_preamble(kernel, func_id):
+def get_argext_preamble(kernel, func_id, arg_dtypes):
     op = func_id.reduction_op
-    prefix = op.prefix(func_id.scalar_dtype, func_id.index_dtype)
-
-    from pymbolic.mapper.c_code import CCodeMapper
+    scalar_dtype = arg_dtypes[0]
+    index_dtype = arg_dtypes[1]
 
-    c_code_mapper = CCodeMapper()
-
-    neutral = get_ge_neutral if op.neutral_sign < 0 else get_le_neutral
+    prefix = op.prefix(scalar_dtype, index_dtype)
 
     return (prefix, """
-    inline %(scalar_t)s %(prefix)s_init(%(index_t)s *index_out)
-    {
-        *index_out = INT_MIN;
-        return %(neutral)s;
-    }
-
-    inline %(scalar_t)s %(prefix)s_update(
+    inline %(scalar_t)s %(prefix)s_op(
         %(scalar_t)s op1, %(index_t)s index1,
         %(scalar_t)s op2, %(index_t)s index2,
         %(index_t)s *index_out)
@@ -393,10 +356,9 @@ def get_argext_preamble(kernel, func_id):
         }
     }
     """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype),
+            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
             prefix=prefix,
-            index_t=kernel.target.dtype_to_typename(func_id.index_dtype),
-            neutral=c_code_mapper(neutral(func_id.scalar_dtype)),
+            index_t=kernel.target.dtype_to_typename(index_dtype),
             comp=op.update_comparison,
             ))
 
@@ -454,76 +416,47 @@ def parse_reduction_op(name):
 
 
 def reduction_function_mangler(kernel, func_id, arg_dtypes):
-    if isinstance(func_id, ArgExtFunction) and func_id.name == "init":
+    if isinstance(func_id, ArgExtOp):
         from loopy.target.opencl import CTarget
         if not isinstance(kernel.target, CTarget):
             raise LoopyError("%s: only C-like targets supported for now" % func_id)
 
         op = func_id.reduction_op
+        scalar_dtype = arg_dtypes[0]
+        index_dtype = arg_dtypes[1]
 
         from loopy.kernel.data import CallMangleInfo
         return CallMangleInfo(
-                target_name="%s_init" % op.prefix(
-                    func_id.scalar_dtype, func_id.index_dtype),
+                target_name="%s_op" % op.prefix(
+                    scalar_dtype, index_dtype),
                 result_dtypes=op.result_dtypes(
-                    kernel, func_id.scalar_dtype, func_id.index_dtype),
-                arg_dtypes=(),
-                )
-
-    elif isinstance(func_id, ArgExtFunction) and func_id.name == "update":
-        from loopy.target.opencl import CTarget
-        if not isinstance(kernel.target, CTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_update" % op.prefix(
-                    func_id.scalar_dtype, func_id.index_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, func_id.scalar_dtype, func_id.index_dtype),
+                    kernel, scalar_dtype, index_dtype),
                 arg_dtypes=(
-                    func_id.scalar_dtype,
-                    kernel.index_dtype,
-                    func_id.scalar_dtype,
-                    kernel.index_dtype),
-                )
-
-    elif isinstance(func_id, SegmentedFunction) and func_id.name == "init":
-        from loopy.target.opencl import CTarget
-        if not isinstance(kernel.target, CTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_init" % op.prefix(
-                    func_id.scalar_dtype, func_id.segment_flag_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, func_id.scalar_dtype, func_id.segment_flag_dtype),
-                arg_dtypes=(),
+                    scalar_dtype,
+                    index_dtype,
+                    scalar_dtype,
+                    index_dtype),
                 )
 
-    elif isinstance(func_id, SegmentedFunction) and func_id.name == "update":
+    elif isinstance(func_id, SegmentedOp):
         from loopy.target.opencl import CTarget
         if not isinstance(kernel.target, CTarget):
             raise LoopyError("%s: only C-like targets supported for now" % func_id)
 
         op = func_id.reduction_op
+        scalar_dtype, segment_flag_dtype = arg_dtypes
 
         from loopy.kernel.data import CallMangleInfo
         return CallMangleInfo(
-                target_name="%s_update" % op.prefix(
-                    func_id.scalar_dtype, func_id.segment_flag_dtype),
+                target_name="%s_op" % op.prefix(
+                    scalar_dtype, segment_flag_dtype),
                 result_dtypes=op.result_dtypes(
-                    kernel, func_id.scalar_dtype, func_id.segment_flag_dtype),
+                    kernel, scalar_dtype, segment_flag_dtype),
                 arg_dtypes=(
-                    func_id.scalar_dtype,
-                    func_id.segment_flag_dtype,
-                    func_id.scalar_dtype,
-                    func_id.segment_flag_dtype),
+                    scalar_dtype,
+                    segment_flag_dtype,
+                    scalar_dtype,
+                    segment_flag_dtype),
                 )
 
     return None
@@ -533,16 +466,18 @@ def reduction_preamble_generator(preamble_info):
     from loopy.target.opencl import OpenCLTarget
 
     for func in preamble_info.seen_functions:
-        if isinstance(func.name, ArgExtFunction):
+        if isinstance(func.name, ArgExtOp):
             if not isinstance(preamble_info.kernel.target, OpenCLTarget):
                 raise LoopyError("only OpenCL supported for now")
 
-            yield get_argext_preamble(preamble_info.kernel, func.name)
+            yield get_argext_preamble(preamble_info.kernel, func.name,
+                    func.arg_dtypes)
 
-        elif isinstance(func.name, SegmentedFunction):
+        elif isinstance(func.name, SegmentedOp):
             if not isinstance(preamble_info.kernel.target, OpenCLTarget):
                 raise LoopyError("only OpenCL supported for now")
 
-            yield get_segmented_function_preamble(preamble_info.kernel, func.name)
+            yield get_segmented_function_preamble(preamble_info.kernel, func.name,
+                    func.arg_dtypes)
 
 # vim: fdm=marker
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 21db62610..ad5fd72b6 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2335,6 +2335,47 @@ def test_kernel_var_name_generator():
     assert vng("b") != "b"
 
 
+def test_complex_argmin(ctx_factory):
+    cl_ctx = ctx_factory()
+    knl = lp.make_kernel(
+            "{[ictr,itgt,idim]: "
+            "0<=itgt<ntargets "
+            "and 0<=ictr<ncenters "
+            "and 0<=idim<ambient_dim}",
+
+            """
+            for itgt
+                for ictr
+                    <> dist_sq = sum(idim,
+                            (tgt[idim,itgt] - center[idim,ictr])**2)
+                    <> in_disk = dist_sq < (radius[ictr]*1.05)**2
+                    <> matches = (
+                            (in_disk
+                                and qbx_forced_limit == 0)
+                            or (in_disk
+                                    and qbx_forced_limit != 0
+                                    and qbx_forced_limit * center_side[ictr] > 0)
+                            )
+
+                    <> post_dist_sq = if(matches, dist_sq, HUGE)
+                end
+                <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
+
+                tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
+            end
+            """)
+
+    knl = lp.fix_parameters(knl, ambient_dim=2)
+    knl = lp.add_and_infer_dtypes(knl, {
+            "tgt,center,radius,HUGE": np.float32, 
+            "center_side,qbx_forced_limit": np.int32,
+            })
+
+    lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
+            "HUGE": 1e20, "ncenters": 200, "ntargets": 300,
+            "qbx_forced_limit": 1})
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From d5222cf99108c2f017caf1f324c180d8916044a1 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 15 Jul 2017 19:07:45 -0500
Subject: [PATCH 07/18] Change tuple assignment to be implemented directly by
 the code generator, so that we can avoid generating a dummy C function.

---
 loopy/library/tuple.py     | 32 +-------------------------------
 loopy/target/c/__init__.py | 33 +++++++++++++++++++++++++++++++++
 loopy/target/opencl.py     |  2 --
 3 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/loopy/library/tuple.py b/loopy/library/tuple.py
index dd6b553eb..ce2865ff5 100644
--- a/loopy/library/tuple.py
+++ b/loopy/library/tuple.py
@@ -29,40 +29,10 @@ def tuple_function_mangler(kernel, name, arg_dtypes):
     if name == "make_tuple":
         from loopy.kernel.data import CallMangleInfo
         return CallMangleInfo(
-                target_name=tuple_function_name(*arg_dtypes),
+                target_name="loopy_make_tuple",
                 result_dtypes=arg_dtypes,
                 arg_dtypes=arg_dtypes)
 
     return None
 
-
-def tuple_function_name(dtype0, dtype1):
-    return "loopy_tuple_%s_%s" % (
-            dtype0.numpy_dtype.type.__name__, dtype1.numpy_dtype.type.__name__)
-
-
-def get_tuple_preamble(kernel, func_id, arg_dtypes):
-    name = tuple_function_name(*arg_dtypes)
-    return (name, """
-    inline %(t0)s %(name)s(%(t0)s i0, %(t1)s i1, %(t1)s *o1)
-    {
-      *o1 = i1;
-      return i0;
-    }
-    """ % dict(name=name,
-            t0=kernel.target.dtype_to_typename(arg_dtypes[0]),
-            t1=kernel.target.dtype_to_typename(arg_dtypes[1])))
-
-
-def tuple_preamble_generator(preamble_info):
-    from loopy.target.opencl import OpenCLTarget
-
-    for func in preamble_info.seen_functions:
-        if func.name == "make_tuple":
-            if not isinstance(preamble_info.kernel.target, OpenCLTarget):
-                raise LoopyError("only OpenCL supported for now")
-
-            yield get_tuple_preamble(preamble_info.kernel, func.name,
-                    func.arg_dtypes)
-
 # vim: fdm=marker
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index e4835a363..ed1ba1ce9 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -648,6 +648,36 @@ class CASTBuilder(ASTBuilderBase):
             lhs_expr, rhs_expr, lhs_dtype):
         raise NotImplementedError("atomic updates in %s" % type(self).__name__)
 
+    def emit_tuple_assignment(self, codegen_state, insn):
+        ecm = codegen_state.expression_to_code_mapper
+
+        parameters = insn.expression.parameters
+        parameter_dtypes = tuple(ecm.infer_type(par) for par in parameters)
+
+        from cgen import Assign, block_if_necessary
+        assignments = []
+
+        for i, (assignee, tgt_dtype) in enumerate(
+                zip(insn.assignees, parameter_dtypes)):
+            if tgt_dtype != ecm.infer_type(assignee):
+                raise LoopyError("type mismatch in %d'th (0-based) left-hand "
+                        "side of instruction '%s'" % (i, insn.id))
+
+            lhs_code = ecm(assignee, prec=PREC_NONE, type_context=None)
+            assignee_var_name = insn.assignee_var_names()[i]
+            lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
+            lhs_dtype = lhs_var.dtype
+
+            from loopy.expression import dtype_to_type_context
+            rhs_type_context = dtype_to_type_context(
+                    codegen_state.kernel.target, lhs_dtype)
+            rhs_code = ecm(parameters[i], prec=PREC_NONE,
+                           type_context=rhs_type_context, needed_dtype=lhs_dtype)
+
+            assignments.append(Assign(lhs_code, rhs_code))
+
+        return block_if_necessary(assignments)
+
     def emit_multiple_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
 
@@ -674,6 +704,9 @@ class CASTBuilder(ASTBuilderBase):
 
         assert mangle_result.arg_dtypes is not None
 
+        if mangle_result.target_name == "loopy_make_tuple":
+            return self.emit_tuple_assignment(codegen_state, insn)
+
         from loopy.expression import dtype_to_type_context
         c_parameters = [
                 ecm(par, PREC_NONE,
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index e70acfeab..a5f7562c4 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -390,13 +390,11 @@ class OpenCLCASTBuilder(CASTBuilder):
 
     def preamble_generators(self):
         from loopy.library.reduction import reduction_preamble_generator
-        from loopy.library.tuple import tuple_preamble_generator
 
         return (
                 super(OpenCLCASTBuilder, self).preamble_generators() + [
                     opencl_preamble_generator,
                     reduction_preamble_generator,
-                    tuple_preamble_generator
                     ])
 
     # }}}
-- 
GitLab


From 542e9756758f40c58f0503a4d0c7c993ec3137c2 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 15 Jul 2017 20:18:39 -0500
Subject: [PATCH 08/18] test_nested_scan(): Declare the dtype of tmp (I think
 it's fair that it fails without this.)

---
 test/test_scan.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_scan.py b/test/test_scan.py
index 08754819c..c225c2c1c 100644
--- a/test/test_scan.py
+++ b/test/test_scan.py
@@ -182,6 +182,7 @@ def test_nested_scan(ctx_factory, i_tag, j_tag):
     knl = lp.fix_parameters(knl, n=10)
     knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))
 
+    knl = lp.add_dtypes(knl, dict(tmp=int))
     knl = lp.realize_reduction(knl, force_scan=True)
 
     print(knl)
-- 
GitLab


From 45332d8f857b34da1efeb159d7f52b3b63656c1a Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 15 Jul 2017 20:19:37 -0500
Subject: [PATCH 09/18] Tuple private scalar assignment hack: Skip
 make_tuple().

---
 loopy/preprocess.py | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index de7f2b593..c1492789f 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -773,6 +773,21 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
 
         FIRST_POINTER_ASSIGNEE_IDX = 1  # noqa
 
+        param_dtypes = tuple(type_inf_mapper(param)
+                for param in insn.expression.parameters)
+
+        func_id = insn.expression.function
+
+        from pymbolic.primitives import Variable
+        if isinstance(func_id, Variable):
+            func_id = func_id.name
+
+        mangle_result = kernel.mangle_function(func_id, param_dtypes)
+
+        if mangle_result.target_name == "loopy_make_tuple":
+            # Skip loopy_make_tuple. This is lowered without a function call.
+            continue
+
         assignee_dtypes, = type_inf_mapper(
                 insn.expression, return_tuple=True, return_dtype_set=True)
 
@@ -806,7 +821,8 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
             new_temporaries[new_assignee_name] = (
                     TemporaryVariable(
                         name=new_assignee_name,
-                        dtype=assignee_dtypes[assignee_nr],
+                        dtype=assignee_dtypes[assignee_nr].with_target(
+                            kernel.target),
                         scope=temp_var_scope.PRIVATE))
 
             from pymbolic import var
@@ -985,12 +1001,18 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 var_name_gen(id + "_arg" + str(i))
                 for i in range(nresults)]
 
-        for name in temp_var_names:
+        from loopy.type_inference import infer_arg_and_reduction_dtypes_for_reduction_expression
+
+        _, reduction_dtypes = (
+                infer_arg_and_reduction_dtypes_for_reduction_expression(
+                    temp_kernel, expr, unknown_types_ok=False))
+
+        for name, dtype in zip(temp_var_names, reduction_dtypes):
             from loopy.kernel.data import TemporaryVariable, temp_var_scope
             new_temporary_variables[name] = TemporaryVariable(
                     name=name,
                     shape=(),
-                    dtype=lp.auto,
+                    dtype=dtype,
                     scope=temp_var_scope.PRIVATE)
 
         from pymbolic import var
-- 
GitLab


From 31a2bfb021b31def8d0ccf41f6ba1939d1ea310a Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 15 Jul 2017 20:20:13 -0500
Subject: [PATCH 10/18] Fix dtype getting for reduction mangler.

---
 loopy/library/reduction.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 962b31681..bd085b7e8 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -258,7 +258,8 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
 
 def get_segmented_function_preamble(kernel, func_id, arg_dtypes):
     op = func_id.reduction_op
-    scalar_dtype, segment_flag_dtype = arg_dtypes
+    scalar_dtype = arg_dtypes[0]
+    segment_flag_dtype = arg_dtypes[1]
     prefix = op.prefix(scalar_dtype, segment_flag_dtype)
 
     return (prefix, """
@@ -444,7 +445,8 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes):
             raise LoopyError("%s: only C-like targets supported for now" % func_id)
 
         op = func_id.reduction_op
-        scalar_dtype, segment_flag_dtype = arg_dtypes
+        scalar_dtype = arg_dtypes[0]
+        segment_flag_dtype = arg_dtypes[1]
 
         from loopy.kernel.data import CallMangleInfo
         return CallMangleInfo(
-- 
GitLab


From 1e0696efb71c3e0aa83f0dacd3d50afdef7c9825 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 15 Jul 2017 20:20:35 -0500
Subject: [PATCH 11/18] Type inference for reduction: Make sure reduction arg
 types have a target too.

---
 loopy/type_inference.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 3fb165ead..7b3a67c6b 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -671,6 +671,11 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression(
                 raise LoopyError("failed to determine type of accumulator for "
                         "reduction '%s'" % expr)
 
+    arg_dtypes = tuple(
+            dt.with_target(kernel.target)
+            if dt is not lp.auto else dt
+            for dt in arg_dtypes)
+
     reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes)
     reduction_dtypes = tuple(
             dt.with_target(kernel.target)
-- 
GitLab


From 28800337b5fed0b72ff9d1bfb7706faa2a2b3048 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 15 Jul 2017 20:23:48 -0500
Subject: [PATCH 12/18] Move complicated argmin test inside test_loopy to
 reduce the chance of a merge conflict.

---
 test/test_loopy.py | 82 +++++++++++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 3ac857478..48cb6980a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2087,6 +2087,47 @@ def test_integer_reduction(ctx_factory):
             assert function(out)
 
 
+def test_complicated_argmin_reduction(ctx_factory):
+    cl_ctx = ctx_factory()
+    knl = lp.make_kernel(
+            "{[ictr,itgt,idim]: "
+            "0<=itgt<ntargets "
+            "and 0<=ictr<ncenters "
+            "and 0<=idim<ambient_dim}",
+
+            """
+            for itgt
+                for ictr
+                    <> dist_sq = sum(idim,
+                            (tgt[idim,itgt] - center[idim,ictr])**2)
+                    <> in_disk = dist_sq < (radius[ictr]*1.05)**2
+                    <> matches = (
+                            (in_disk
+                                and qbx_forced_limit == 0)
+                            or (in_disk
+                                    and qbx_forced_limit != 0
+                                    and qbx_forced_limit * center_side[ictr] > 0)
+                            )
+
+                    <> post_dist_sq = if(matches, dist_sq, HUGE)
+                end
+                <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
+
+                tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
+            end
+            """)
+
+    knl = lp.fix_parameters(knl, ambient_dim=2)
+    knl = lp.add_and_infer_dtypes(knl, {
+            "tgt,center,radius,HUGE": np.float32,
+            "center_side,qbx_forced_limit": np.int32,
+            })
+
+    lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
+            "HUGE": 1e20, "ncenters": 200, "ntargets": 300,
+            "qbx_forced_limit": 1})
+
+
 def test_nosync_option_parsing():
     knl = lp.make_kernel(
         "{[i]: 0 <= i < 10}",
@@ -2335,47 +2376,6 @@ def test_kernel_var_name_generator():
     assert vng("b") != "b"
 
 
-def test_complex_argmin(ctx_factory):
-    cl_ctx = ctx_factory()
-    knl = lp.make_kernel(
-            "{[ictr,itgt,idim]: "
-            "0<=itgt<ntargets "
-            "and 0<=ictr<ncenters "
-            "and 0<=idim<ambient_dim}",
-
-            """
-            for itgt
-                for ictr
-                    <> dist_sq = sum(idim,
-                            (tgt[idim,itgt] - center[idim,ictr])**2)
-                    <> in_disk = dist_sq < (radius[ictr]*1.05)**2
-                    <> matches = (
-                            (in_disk
-                                and qbx_forced_limit == 0)
-                            or (in_disk
-                                    and qbx_forced_limit != 0
-                                    and qbx_forced_limit * center_side[ictr] > 0)
-                            )
-
-                    <> post_dist_sq = if(matches, dist_sq, HUGE)
-                end
-                <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
-
-                tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
-            end
-            """)
-
-    knl = lp.fix_parameters(knl, ambient_dim=2)
-    knl = lp.add_and_infer_dtypes(knl, {
-            "tgt,center,radius,HUGE": np.float32, 
-            "center_side,qbx_forced_limit": np.int32,
-            })
-
-    lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
-            "HUGE": 1e20, "ncenters": 200, "ntargets": 300,
-            "qbx_forced_limit": 1})
-
-
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From 2de504011b315dae9a793f36ad2adbea8375c10f Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 15 Jul 2017 20:24:28 -0500
Subject: [PATCH 13/18] Pacify flake8.

---
 loopy/preprocess.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index c1492789f..8172051cc 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -1001,7 +1001,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 var_name_gen(id + "_arg" + str(i))
                 for i in range(nresults)]
 
-        from loopy.type_inference import infer_arg_and_reduction_dtypes_for_reduction_expression
+        from loopy.type_inference import (
+                infer_arg_and_reduction_dtypes_for_reduction_expression)
 
         _, reduction_dtypes = (
                 infer_arg_and_reduction_dtypes_for_reduction_expression(
-- 
GitLab


From e203e212d04e74714316a86be34a1813cf426568 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 16 Jul 2017 14:43:37 -0500
Subject: [PATCH 14/18] Remove tuple.py

---
 loopy/library/function.py | 12 +++++++++++-
 loopy/library/tuple.py    | 38 --------------------------------------
 2 files changed, 11 insertions(+), 39 deletions(-)
 delete mode 100644 loopy/library/tuple.py

diff --git a/loopy/library/function.py b/loopy/library/function.py
index f3d14516c..9d557ac9f 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -25,7 +25,6 @@ THE SOFTWARE.
 
 def default_function_mangler(kernel, name, arg_dtypes):
     from loopy.library.reduction import reduction_function_mangler
-    from loopy.library.tuple import tuple_function_mangler
 
     manglers = [reduction_function_mangler, tuple_function_mangler]
     for mangler in manglers:
@@ -46,4 +45,15 @@ def single_arg_function_mangler(kernel, name, arg_dtypes):
     return None
 
 
+def tuple_function_mangler(kernel, name, arg_dtypes):
+    if name == "make_tuple":
+        from loopy.kernel.data import CallMangleInfo
+        return CallMangleInfo(
+                target_name="loopy_make_tuple",
+                result_dtypes=arg_dtypes,
+                arg_dtypes=arg_dtypes)
+
+    return None
+
+
 # vim: foldmethod=marker
diff --git a/loopy/library/tuple.py b/loopy/library/tuple.py
deleted file mode 100644
index ce2865ff5..000000000
--- a/loopy/library/tuple.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from __future__ import absolute_import, division, print_function
-
-__copyright__ = "Copyright (C) 2017 Matt Wala"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-from loopy.diagnostic import LoopyError
-
-
-def tuple_function_mangler(kernel, name, arg_dtypes):
-    if name == "make_tuple":
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="loopy_make_tuple",
-                result_dtypes=arg_dtypes,
-                arg_dtypes=arg_dtypes)
-
-    return None
-
-# vim: fdm=marker
-- 
GitLab


From 83428f328e9ef433f9422809562d82e6c52d8819 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 19 Jul 2017 01:04:27 -0500
Subject: [PATCH 15/18] Be less strict about data types in tuples / reductions.

---
 loopy/library/reduction.py |  6 ++++--
 loopy/target/c/__init__.py | 15 ++++-----------
 test/test_scan.py          |  1 -
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index bd085b7e8..3c5f4a142 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -123,7 +123,8 @@ class ScalarReductionOperation(ReductionOperation):
 
 class SumReductionOperation(ScalarReductionOperation):
     def neutral_element(self, dtype):
-        return dtype.numpy_dtype.type(0)
+        # FIXME: Document that we always use an int here.
+        return 0
 
     def __call__(self, dtype, operand1, operand2):
         return operand1 + operand2
@@ -131,7 +132,8 @@ class SumReductionOperation(ScalarReductionOperation):
 
 class ProductReductionOperation(ScalarReductionOperation):
     def neutral_element(self, dtype):
-        return dtype.numpy_dtype.type(1)
+        # FIXME: Document that we always use an int here.
+        return 1
 
     def __call__(self, dtype, operand1, operand2):
         return operand1 * operand2
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index ed1ba1ce9..e9457233f 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -651,18 +651,11 @@ class CASTBuilder(ASTBuilderBase):
     def emit_tuple_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
 
-        parameters = insn.expression.parameters
-        parameter_dtypes = tuple(ecm.infer_type(par) for par in parameters)
-
         from cgen import Assign, block_if_necessary
         assignments = []
 
-        for i, (assignee, tgt_dtype) in enumerate(
-                zip(insn.assignees, parameter_dtypes)):
-            if tgt_dtype != ecm.infer_type(assignee):
-                raise LoopyError("type mismatch in %d'th (0-based) left-hand "
-                        "side of instruction '%s'" % (i, insn.id))
-
+        for i, (assignee, parameter) in enumerate(
+                zip(insn.assignees, insn.expression.parameters)):
             lhs_code = ecm(assignee, prec=PREC_NONE, type_context=None)
             assignee_var_name = insn.assignee_var_names()[i]
             lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
@@ -671,8 +664,8 @@ class CASTBuilder(ASTBuilderBase):
             from loopy.expression import dtype_to_type_context
             rhs_type_context = dtype_to_type_context(
                     codegen_state.kernel.target, lhs_dtype)
-            rhs_code = ecm(parameters[i], prec=PREC_NONE,
-                           type_context=rhs_type_context, needed_dtype=lhs_dtype)
+            rhs_code = ecm(parameter, prec=PREC_NONE,
+                    type_context=rhs_type_context, needed_dtype=lhs_dtype)
 
             assignments.append(Assign(lhs_code, rhs_code))
 
diff --git a/test/test_scan.py b/test/test_scan.py
index c225c2c1c..08754819c 100644
--- a/test/test_scan.py
+++ b/test/test_scan.py
@@ -182,7 +182,6 @@ def test_nested_scan(ctx_factory, i_tag, j_tag):
     knl = lp.fix_parameters(knl, n=10)
     knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))
 
-    knl = lp.add_dtypes(knl, dict(tmp=int))
     knl = lp.realize_reduction(knl, force_scan=True)
 
     print(knl)
-- 
GitLab


From 37ca145f1c588fd38179d6f20f52f91d14a78de2 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 19 Jul 2017 01:27:10 -0500
Subject: [PATCH 16/18] Undo changes to preprocess. They were not needed.

---
 loopy/preprocess.py | 36 ++++--------------------------------
 1 file changed, 4 insertions(+), 32 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 8172051cc..ced1aaaa1 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -749,9 +749,6 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
 
     # }}}
 
-    from loopy.type_inference import TypeInferenceMapper
-    type_inf_mapper = TypeInferenceMapper(kernel)
-
     from loopy.kernel.instruction import CallInstruction
     for insn in kernel.instructions:
         if not isinstance(insn, CallInstruction):
@@ -773,24 +770,6 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
 
         FIRST_POINTER_ASSIGNEE_IDX = 1  # noqa
 
-        param_dtypes = tuple(type_inf_mapper(param)
-                for param in insn.expression.parameters)
-
-        func_id = insn.expression.function
-
-        from pymbolic.primitives import Variable
-        if isinstance(func_id, Variable):
-            func_id = func_id.name
-
-        mangle_result = kernel.mangle_function(func_id, param_dtypes)
-
-        if mangle_result.target_name == "loopy_make_tuple":
-            # Skip loopy_make_tuple. This is lowered without a function call.
-            continue
-
-        assignee_dtypes, = type_inf_mapper(
-                insn.expression, return_tuple=True, return_dtype_set=True)
-
         for assignee_nr, assignee_var_name, assignee in zip(
                 range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)),
                 assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:],
@@ -818,11 +797,11 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
 
             newly_added_assignments_ids.add(new_assignment_id)
 
+            import loopy as lp
             new_temporaries[new_assignee_name] = (
                     TemporaryVariable(
                         name=new_assignee_name,
-                        dtype=assignee_dtypes[assignee_nr].with_target(
-                            kernel.target),
+                        dtype=lp.auto,
                         scope=temp_var_scope.PRIVATE))
 
             from pymbolic import var
@@ -1001,19 +980,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 var_name_gen(id + "_arg" + str(i))
                 for i in range(nresults)]
 
-        from loopy.type_inference import (
-                infer_arg_and_reduction_dtypes_for_reduction_expression)
-
-        _, reduction_dtypes = (
-                infer_arg_and_reduction_dtypes_for_reduction_expression(
-                    temp_kernel, expr, unknown_types_ok=False))
-
-        for name, dtype in zip(temp_var_names, reduction_dtypes):
+        for name in temp_var_names:
             from loopy.kernel.data import TemporaryVariable, temp_var_scope
             new_temporary_variables[name] = TemporaryVariable(
                     name=name,
                     shape=(),
-                    dtype=dtype,
+                    dtype=lp.auto,
                     scope=temp_var_scope.PRIVATE)
 
         from pymbolic import var
-- 
GitLab


From 896e16df432ca08bf41b960fef9ec8742ab712cd Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 19 Jul 2017 01:27:47 -0500
Subject: [PATCH 17/18] Undo changes to type inference. They were not needed.

---
 loopy/type_inference.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 45da8eb3e..409cbbc5e 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -443,7 +443,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
                         result_i = comp_dtype_set
                         break
 
-                assert found, var_name
+                assert found
                 if result_i is not None:
                     result.append(result_i)
 
@@ -672,11 +672,6 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression(
                 raise LoopyError("failed to determine type of accumulator for "
                         "reduction '%s'" % expr)
 
-    arg_dtypes = tuple(
-            dt.with_target(kernel.target)
-            if dt is not lp.auto else dt
-            for dt in arg_dtypes)
-
     reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes)
     reduction_dtypes = tuple(
             dt.with_target(kernel.target)
-- 
GitLab


From c106a828aa2417ea30a35800e8ba839f484788d8 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 19 Jul 2017 01:30:57 -0500
Subject: [PATCH 18/18] Add a name attribute to ReductionOpFunction.

---
 loopy/library/reduction.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 3c5f4a142..0e5a093b7 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -202,6 +202,10 @@ class ReductionOpFunction(FunctionIdentifier):
     def __getinitargs__(self):
         return (self.reduction_op,)
 
+    @property
+    def name(self):
+        return self.__class__.__name__
+
 # }}}
 
 
-- 
GitLab