diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0d8e771954cf26cc11747e745946389420fa5e1b..9acde7e9f0ed70f6f336d7d12cbf39c1afb63f85 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.codegen import Unvectorizable import logging logger = logging.getLogger(__name__) @@ -545,7 +546,42 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): assert len(acc_vars) == 1 return acc_vars[0][outer_local_iname_vars + (0,)] else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return tuple(acc_var[outer_local_iname_vars + (0,)] + for acc_var in acc_vars) + # }}} + + # {{{ reduction of vector iname + + def map_reduction_vector(expr, rec, nresults, arg_dtype, reduction_dtypes): + try: + builder = kernel.target.get_device_ast_builder() + return builder.emit_vector_reduction(expr) + except Unvectorizable: + # Provide an unrolled fallback + from loopy.kernel.data import VectorizeTag + iname, = tuple(i for i in expr.inames + if isinstance(kernel.iname_to_tag[i], VectorizeTag)) + + # Extract the vector length from the domain + bound = kernel.get_iname_bounds(iname, constants_only=True) + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + length_aff = static_max_of_pw_aff(bound.size, constants_only=True) + length = int(pw_aff_to_expr(length_aff)) + + # The below code should return a neutral element, if the loop domain + # is empty - can that actually happen? 
+            assert length > 0
+
+            # Unroll the reduction
+            from pymbolic import substitute, var
+            result = substitute(expr.expr, variable_assignments={var(iname): 0})
+            for idx in range(1, length):
+                newi = substitute(expr.expr, variable_assignments={var(iname): idx})
+                result = expr.operation(arg_dtype, result, newi,
+                        set(expr.inames) - set([iname]))
+
+            return result

     # }}}

     # {{{ seq/par dispatch

@@ -581,6 +617,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):

         n_sequential = 0
         n_local_par = 0
+        n_vec = 0

         from loopy.kernel.data import (
                 LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
@@ -596,9 +633,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             elif isinstance(iname_tag, LocalIndexTagBase):
                 n_local_par += 1

-            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
-                raise LoopyError("the only form of parallelism supported "
-                        "by reductions is 'local'--found iname '%s' "
+            elif isinstance(iname_tag, VectorizeTag):
+                n_vec += 1
+
+            elif isinstance(iname_tag, ParallelTag):
+                raise LoopyError("the only forms of parallelism supported "
+                        "by reductions are 'local' and 'vec'--found iname '%s' "
                         "tagged '%s'"
                         % (iname, type(iname_tag).__name__))

@@ -619,6 +659,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     "before code generation."
                     % ", ".join(expr.inames))

+        if n_vec > 1:
+            raise LoopyError("Reduction over '%s' contains more than one iname "
+                    "tagged as vectorized." % ", ".join(expr.inames))
+
+        if n_vec:
+            return map_reduction_vector(expr, rec, nresults, arg_dtype,
+                    reduction_dtypes)
+
         if n_sequential:
             assert n_local_par == 0
             return map_reduction_seq(expr, rec, nresults, arg_dtype,
@@ -662,82 +710,79 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):

             nresults = len(insn.assignees)

-            # Run reduction expansion.
+ # Do the expansion of reductions from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: new_expressions = cb_mapper(insn.expression, nresults=nresults) else: new_expressions = (cb_mapper(insn.expression),) - if generated_insns: - # An expansion happened, so insert the generated stuff plus - # ourselves back into the queue. - - kwargs = insn.get_copy_kwargs( - depends_on=insn.depends_on - | frozenset(new_insn_add_depends_on), - no_sync_with=insn.no_sync_with - | frozenset(new_insn_add_no_sync_with), - within_inames=( - temp_kernel.insn_inames(insn) - | new_insn_add_within_inames)) - - kwargs.pop("id") - kwargs.pop("expression") - kwargs.pop("assignee", None) - kwargs.pop("assignees", None) - kwargs.pop("temp_var_type", None) - kwargs.pop("temp_var_types", None) - - if isinstance(insn.expression, Reduction) and nresults > 1: - replacement_insns = [ - lp.Assignment( - id=insn_id_gen(insn.id), - assignee=assignee, - expression=new_expr, - **kwargs) - for assignee, new_expr in zip( - insn.assignees, new_expressions)] + # Find out whether we actually expanded something. + # If not, we save this insn for the final kernel and continue + if new_expressions == (insn.expression,): + assert not new_insn_add_depends_on + new_insns.append(insn) + continue + + # An expansion happened, so insert the generated stuff plus + # ourselves back into the queue. 
+ kwargs = insn.get_copy_kwargs( + depends_on=insn.depends_on + | frozenset(new_insn_add_depends_on), + no_sync_with=insn.no_sync_with + | frozenset(new_insn_add_no_sync_with), + within_inames=( + temp_kernel.insn_inames(insn) + | new_insn_add_within_inames)) - else: - new_expr, = new_expressions - replacement_insns = [ - make_assignment( - id=insn_id_gen(insn.id), - assignees=insn.assignees, - expression=new_expr, - **kwargs) - ] - - insn_id_replacements[insn.id] = [ - rinsn.id for rinsn in replacement_insns] - - insn_queue = generated_insns + replacement_insns + insn_queue - - # The reduction expander needs an up-to-date kernel - # object to find dependencies. Keep temp_kernel up-to-date. - - temp_kernel = kernel.copy( - instructions=new_insns + insn_queue, - temporary_variables=new_temporary_variables, - domains=domains) - temp_kernel = lp.replace_instruction_ids( - temp_kernel, insn_id_replacements) + kwargs.pop("id") + kwargs.pop("expression") + kwargs.pop("assignee", None) + kwargs.pop("assignees", None) + kwargs.pop("temp_var_type", None) + kwargs.pop("temp_var_types", None) + if isinstance(insn.expression, Reduction) and nresults > 1: + replacement_insns = [ + lp.Assignment( + id=insn_id_gen(insn.id), + assignee=assignee, + expression=new_expr, + **kwargs) + for assignee, new_expr in zip( + insn.assignees, new_expressions)] else: - # nothing happened, we're done with insn - assert not new_insn_add_depends_on - - new_insns.append(insn) + new_expr, = new_expressions + replacement_insns = [ + make_assignment( + id=insn_id_gen(insn.id), + assignees=insn.assignees, + expression=new_expr, + **kwargs) + ] + + insn_id_replacements[insn.id] = [ + rinsn.id for rinsn in replacement_insns] + + insn_queue = generated_insns + replacement_insns + insn_queue + + # The reduction expander needs an up-to-date kernel + # object to find dependencies. Keep temp_kernel up-to-date. 
+ + temp_kernel = kernel.copy( + instructions=new_insns + insn_queue, + temporary_variables=new_temporary_variables, + domains=domains) + temp_kernel = lp.replace_instruction_ids( + temp_kernel, insn_id_replacements) kernel = kernel.copy( - instructions=new_insns, - temporary_variables=new_temporary_variables, - domains=domains) + instructions=new_insns, + temporary_variables=new_temporary_variables, + domains=domains) kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - - kernel = lp.tag_inames(kernel, new_iname_tags) + kernel = lp.tag_inames(temp_kernel, new_iname_tags) return kernel diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5d5743bae322fc59c989cafd85122c8ca619c422..69e6020abcdaf4c446f36f48f2dbb40313b2d6b7 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -41,6 +41,8 @@ __doc__ = """ """ +from loopy.codegen import Unvectorizable + class TargetBase(object): """Base class for all targets, i.e. different combinations of code that @@ -103,7 +105,7 @@ class TargetBase(object): def get_device_ast_builder(self): """ - :returns: a class implementing :class:`ASTBuilderBase` for the host code + :returns: a class implementing :class:`ASTBuilderBase` for the device code """ raise NotImplementedError() @@ -210,6 +212,9 @@ class ASTBuilderBase(object): static_lbound, static_ubound, inner): raise NotImplementedError() + def emit_vector_reduction(self, expr): + raise Unvectorizable() + def emit_if(self, condition_str, ast): raise NotImplementedError() diff --git a/test/test_reduction.py b/test/test_reduction.py index 86e72c0c6644b7b9837a6d74da756c58344b1d6f..26e9a7a47c12f0e4a441ade883d4bd786877a739 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -405,6 +405,29 @@ def test_parallel_multi_output_reduction(): # TODO: Add functional test +def test_vector_reduction_fallback(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0<=i<4}", + """ + b = 
sum(i, a[i])
+        """,
+        [lp.GlobalArg("a", np.float32, shape=(4,), dim_tags=("vec",)),
+         lp.GlobalArg("b", np.float32, shape=())]
+    )
+
+    knl = lp.tag_inames(knl, [("i", "vec")])
+    knl = lp.preprocess_kernel(knl)
+    print(knl)
+
+    a = np.random.randn(4).astype(np.float32)
+    ref = sum(a)
+    evt, (a, b) = knl(queue, a=a)
+    assert np.isclose(b.get(), ref)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])