From 22d6ad35266bce2919df5c79fa92d335852fa44e Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Mon, 27 Mar 2017 14:51:26 +0200
Subject: [PATCH 1/5] First steps towards implementing vector iname reductions

This commit
* adds a callback for reductions of vec-tagged inames
* removes the logic "instruction generated => reduction was realized"
  because vector reductions do not necessarily write new instructions
* adds emit_vector_reduction to ASTBuilderBase. By default,
  it raises Unvectorizable.
---
 loopy/preprocess.py      | 166 ++++++++++++++++++++++-----------------
 loopy/target/__init__.py |   7 +-
 2 files changed, 102 insertions(+), 71 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 0d8e77195..ba45d192c 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION
 from loopy.kernel.data import make_assignment
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
+from loopy.codegen import Unvectorizable
 
 import logging
 logger = logging.getLogger(__name__)
@@ -548,6 +549,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars]
     # }}}
 
+    # {{{ reduction of vector iname
+
+    def map_reduction_vector(expr, rec, nresults, arg_dtype, reduction_dtypes):
+        builder = kernel.target.get_device_ast_builder()
+        return builder.emit_vector_reduction(expr)
+
+    # }}}
+
+
     # {{{ seq/par dispatch
 
     def map_reduction(expr, rec, nresults=1):
@@ -581,6 +591,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         n_sequential = 0
         n_local_par = 0
+        n_vec = 0
 
         from loopy.kernel.data import (
                 LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
@@ -596,9 +607,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             elif isinstance(iname_tag, LocalIndexTagBase):
                 n_local_par += 1
 
-            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
-                raise LoopyError("the only form of parallelism supported "
-                        "by reductions is 'local'--found iname '%s' "
+            elif isinstance(iname_tag, VectorizeTag):
+                n_vec += 1
+
+            elif isinstance(iname_tag, ParallelTag):
+                raise LoopyError("the only forms of parallelism supported "
+                        "by reductions is 'local' and 'vec'--found iname '%s' "
                         "tagged '%s'"
                         % (iname, type(iname_tag).__name__))
 
@@ -619,6 +633,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     "before code generation."
                     % ", ".join(expr.inames))
 
+        if n_vec > 1:
+            raise LoopyError("Reduction over '%s' contains more than one iname "
+                    "tagged as vectorized." % ", ".join(expr.inames))
+
+        if n_vec:
+            try:
+                #TODO this one should go away for sure
+                assert n_sequential == 0
+                return map_reduction_vector(expr, rec, nresults, arg_dtype,
+                        reduction_dtypes)
+            except Unvectorizable:
+                # Fall back to implementing this as a sequential reduction
+                n_sequential += 1
+
         if n_sequential:
             assert n_local_par == 0
             return map_reduction_seq(expr, rec, nresults, arg_dtype,
@@ -662,82 +690,80 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         nresults = len(insn.assignees)
 
-        # Run reduction expansion.
-        from loopy.symbolic import Reduction
-        if isinstance(insn.expression, Reduction) and nresults > 1:
-            new_expressions = cb_mapper(insn.expression, nresults=nresults)
-        else:
-            new_expressions = (cb_mapper(insn.expression),)
+        # Do the expansion of reductions
+        new_expressions = cb_mapper(insn.expression, nresults=nresults)
 
-        if generated_insns:
-            # An expansion happened, so insert the generated stuff plus
-            # ourselves back into the queue.
+        # Find out whether we actually expanded something and process
+        # the results if so
+        if new_expressions == insn.expression:
+            assert not new_insn_add_depends_on
+            new_insns.append(insn)
+            continue
 
-            kwargs = insn.get_copy_kwargs(
-                    depends_on=insn.depends_on
-                    | frozenset(new_insn_add_depends_on),
-                    no_sync_with=insn.no_sync_with
-                    | frozenset(new_insn_add_no_sync_with),
-                    within_inames=(
-                        temp_kernel.insn_inames(insn)
-                        | new_insn_add_within_inames))
-
-            kwargs.pop("id")
-            kwargs.pop("expression")
-            kwargs.pop("assignee", None)
-            kwargs.pop("assignees", None)
-            kwargs.pop("temp_var_type", None)
-            kwargs.pop("temp_var_types", None)
-
-            if isinstance(insn.expression, Reduction) and nresults > 1:
-                replacement_insns = [
-                        lp.Assignment(
-                            id=insn_id_gen(insn.id),
-                            assignee=assignee,
-                            expression=new_expr,
-                            **kwargs)
-                        for assignee, new_expr in zip(
-                            insn.assignees, new_expressions)]
+        if not isinstance(new_expressions, tuple):
+            new_expressions = (new_expressions,)
 
-            else:
-                new_expr, = new_expressions
-                replacement_insns = [
-                        make_assignment(
-                            id=insn_id_gen(insn.id),
-                            assignees=insn.assignees,
-                            expression=new_expr,
-                            **kwargs)
-                        ]
-
-            insn_id_replacements[insn.id] = [
-                    rinsn.id for rinsn in replacement_insns]
-
-            insn_queue = generated_insns + replacement_insns + insn_queue
-
-            # The reduction expander needs an up-to-date kernel
-            # object to find dependencies. Keep temp_kernel up-to-date.
-
-            temp_kernel = kernel.copy(
-                    instructions=new_insns + insn_queue,
-                    temporary_variables=new_temporary_variables,
-                    domains=domains)
-            temp_kernel = lp.replace_instruction_ids(
-                    temp_kernel, insn_id_replacements)
+        # An expansion happened, so insert the generated stuff plus
+        # ourselves back into the queue.
 
-        else:
-            # nothing happened, we're done with insn
-            assert not new_insn_add_depends_on
+        kwargs = insn.get_copy_kwargs(
+                depends_on=insn.depends_on
+                | frozenset(new_insn_add_depends_on),
+                no_sync_with=insn.no_sync_with
+                | frozenset(new_insn_add_no_sync_with),
+                within_inames=(
+                    temp_kernel.insn_inames(insn)
+                    | new_insn_add_within_inames))
 
-            new_insns.append(insn)
+        kwargs.pop("id")
+        kwargs.pop("expression")
+        kwargs.pop("assignee", None)
+        kwargs.pop("assignees", None)
+        kwargs.pop("temp_var_type", None)
+        kwargs.pop("temp_var_types", None)
+
+        from loopy.symbolic import Reduction
+        if isinstance(insn.expression, Reduction) and nresults > 1:
+            replacement_insns = [
+                    lp.Assignment(
+                        id=insn_id_gen(insn.id),
+                        assignee=assignee,
+                        expression=new_expr,
+                        **kwargs)
+                    for assignee, new_expr in zip(
+                        insn.assignees, new_expressions)]
+        else:
+            new_expr, = new_expressions
+            replacement_insns = [
+                    make_assignment(
+                        id=insn_id_gen(insn.id),
+                        assignees=insn.assignees,
+                        expression=new_expr,
+                        **kwargs)
+                    ]
+
+        insn_id_replacements[insn.id] = [
+                rinsn.id for rinsn in replacement_insns]
+
+        insn_queue = generated_insns + replacement_insns + insn_queue
+
+        # The reduction expander needs an up-to-date kernel
+        # object to find dependencies. Keep temp_kernel up-to-date.
+
+        temp_kernel = kernel.copy(
+                instructions=new_insns + insn_queue,
+                temporary_variables=new_temporary_variables,
+                domains=domains)
+        temp_kernel = lp.replace_instruction_ids(
+                temp_kernel, insn_id_replacements)
 
     kernel = kernel.copy(
-            instructions=new_insns,
-            temporary_variables=new_temporary_variables,
-            domains=domains)
+             instructions=new_insns,
+             temporary_variables=new_temporary_variables,
+             domains=domains)
 
     kernel = lp.replace_instruction_ids(kernel, insn_id_replacements)
-
-    kernel = lp.tag_inames(kernel, new_iname_tags)
+    kernel = lp.tag_inames(kernel, new_iname_tags)
 
     return kernel
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 5d5743bae..69e6020ab 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -41,6 +41,8 @@ __doc__ = """
 
 """
 
+from loopy.codegen import Unvectorizable
+
 
 class TargetBase(object):
     """Base class for all targets, i.e. different combinations of code that
@@ -103,7 +105,7 @@ class TargetBase(object):
 
     def get_device_ast_builder(self):
         """
-        :returns: a class implementing :class:`ASTBuilderBase` for the host code
+        :returns: a class implementing :class:`ASTBuilderBase` for the device code
         """
         raise NotImplementedError()
 
@@ -210,6 +212,9 @@ class ASTBuilderBase(object):
             static_lbound, static_ubound, inner):
         raise NotImplementedError()
 
+    def emit_vector_reduction(self, expr):
+        raise Unvectorizable()  # default: unsupported, callers fall back
+
     def emit_if(self, condition_str, ast):
         raise NotImplementedError()
 
-- 
GitLab


From 193ed2b30640d6b5a8d0a76ad94f8c42b1db3e5e Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Mon, 27 Mar 2017 15:17:54 +0200
Subject: [PATCH 2/5] [bugfix] Unify return type of reduction callbacks to
 tuple

All the other reduction callbacks already return tuples, and we want the result to be hashable.
---
 loopy/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index ba45d192c..7ca6dbbb8 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -546,7 +546,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             assert len(acc_vars) == 1
             return acc_vars[0][outer_local_iname_vars + (0,)]
         else:
-            return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars]
+            return tuple(acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars)
     # }}}
 
     # {{{ reduction of vector iname
-- 
GitLab


From 079764f90f1840841ae563c20b95b12a2e93fccb Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Mon, 27 Mar 2017 15:35:10 +0200
Subject: [PATCH 3/5] [bugfix] Do not pass nresults into the callback, if == 1

It can't handle it...
---
 loopy/preprocess.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 7ca6dbbb8..3cf703002 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -546,7 +546,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             assert len(acc_vars) == 1
             return acc_vars[0][outer_local_iname_vars + (0,)]
         else:
-            return tuple(acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars)
+            return tuple(acc_var[outer_local_iname_vars + (0,)]
+                         for acc_var in acc_vars)
     # }}}
 
     # {{{ reduction of vector iname
@@ -557,7 +558,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     # }}}
 
-
     # {{{ seq/par dispatch
 
     def map_reduction(expr, rec, nresults=1):
@@ -691,21 +691,21 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
         nresults = len(insn.assignees)
 
         # Do the expansion of reductions
-        new_expressions = cb_mapper(insn.expression, nresults=nresults)
-
-        # Find out whether we actually expanded something and process
-        # the results if so
-        if new_expressions == insn.expression:
-            assert not new_insn_add_depends_on
-            new_insns.append(insn)
-            continue
+        from loopy.symbolic import Reduction
+        if isinstance(insn.expression, Reduction) and nresults > 1:
+            new_expressions = cb_mapper(insn.expression, nresults=nresults)
+        else:
+            new_expressions = (cb_mapper(insn.expression),)
 
-        if not isinstance(new_expressions, tuple):
-            new_expressions = (new_expressions,)
+            # Find out whether we actually expanded something.
+            # If not, we save this insn for the final kernel and continue
+            if new_expressions == (insn.expression,):
+                assert not new_insn_add_depends_on
+                new_insns.append(insn)
+                continue
 
         # An expansion happened, so insert the generated stuff plus
         # ourselves back into the queue.
-
         kwargs = insn.get_copy_kwargs(
                 depends_on=insn.depends_on
                 | frozenset(new_insn_add_depends_on),
@@ -722,7 +722,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
         kwargs.pop("temp_var_type", None)
         kwargs.pop("temp_var_types", None)
 
-        from loopy.symbolic import Reduction
         if isinstance(insn.expression, Reduction) and nresults > 1:
             replacement_insns = [
                     lp.Assignment(
-- 
GitLab


From a02f04cc334c889f0bac3a94f6b4c438934aa3d2 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Tue, 28 Mar 2017 13:25:58 +0200
Subject: [PATCH 4/5] Implement an unrolled fallback for vector reductions

---
 loopy/preprocess.py | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 3cf703002..9acde7e9f 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -553,9 +553,35 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
     # {{{ reduction of vector iname
 
     def map_reduction_vector(expr, rec, nresults, arg_dtype, reduction_dtypes):
-        builder = kernel.target.get_device_ast_builder()
-        return builder.emit_vector_reduction(expr)
+        try:  # prefer a target-specific vector reduction
+            builder = kernel.target.get_device_ast_builder()
+            return builder.emit_vector_reduction(expr)
+        except Unvectorizable:
+            # Fallback: unroll over the vec iname (.get: others may be untagged)
+            from loopy.kernel.data import VectorizeTag
+            iname, = tuple(i for i in expr.inames
+                           if isinstance(kernel.iname_to_tag.get(i), VectorizeTag))
+
+            # Extract the vector length from the domain
+            bound = kernel.get_iname_bounds(iname, constants_only=True)
+            from loopy.isl_helpers import static_max_of_pw_aff
+            from loopy.symbolic import pw_aff_to_expr
+            length_aff = static_max_of_pw_aff(bound.size, constants_only=True)
+            length = int(pw_aff_to_expr(length_aff))
+
+            # An empty domain would need the operation's neutral element, which
+            # this fallback does not produce - can that actually happen?
+            assert length > 0
+
+            # Unroll over 0..length-1 (assumes the iname starts at 0 - TODO confirm)
+            from pymbolic import substitute, var
+            result = substitute(expr.expr, variable_assignments={var(iname): 0})
+            for idx in range(1, length):
+                newi = substitute(expr.expr, variable_assignments={var(iname): idx})
+                result = expr.operation(arg_dtype, result, newi,
+                                        set(expr.inames) - set([iname]))
 
+            return result
     # }}}
 
     # {{{ seq/par dispatch
@@ -638,14 +664,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     "tagged as vectorized." % ", ".join(expr.inames))
 
         if n_vec:
-            try:
-                #TODO this one should go away for sure
-                assert n_sequential == 0
-                return map_reduction_vector(expr, rec, nresults, arg_dtype,
-                        reduction_dtypes)
-            except Unvectorizable:
-                # Fall back to implementing this as a sequential reduction
-                n_sequential += 1
+            return map_reduction_vector(expr, rec, nresults, arg_dtype,
+                    reduction_dtypes)
 
         if n_sequential:
             assert n_local_par == 0
-- 
GitLab


From 58ea2c5cf95fd33b906cd0b365144d156f647a93 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Tue, 28 Mar 2017 13:39:20 +0200
Subject: [PATCH 5/5] Add a test for vector reduction fallback

---
 test/test_reduction.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/test/test_reduction.py b/test/test_reduction.py
index 86e72c0c6..26e9a7a47 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -405,6 +405,29 @@ def test_parallel_multi_output_reduction():
     # TODO: Add functional test
 
 
+def test_vector_reduction_fallback(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    # Sum over a 'vec'-tagged iname to exercise the reduction fallback.
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<4}",
+            """
+            b = sum(i, a[i])
+            """,
+            [lp.GlobalArg("a", np.float32, shape=(4,), dim_tags=("vec",)),
+             lp.GlobalArg("b", np.float32, shape=())]
+            )
+
+    knl = lp.tag_inames(knl, [("i", "vec")])
+    knl = lp.preprocess_kernel(knl)
+    print(knl)
+
+    a = np.random.randn(4).astype(np.float32)  # match the declared arg dtype
+    ref = sum(a)
+    evt, (a, b) = knl(queue, a=a)
+    assert np.allclose(b.get(), ref)  # float sums: compare with a tolerance
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab