diff --git a/examples/python/vector-types.py b/examples/python/vector-types.py
index 328aea154bfee653a6fcbf3fc8ad74b08375e13d..82aadb817c2fa1f75c91da870650436cf0049dc9 100644
--- a/examples/python/vector-types.py
+++ b/examples/python/vector-types.py
@@ -14,8 +14,9 @@ knl = lp.make_kernel(
         "out[i] = 2*a[i]")
 
 knl = lp.set_options(knl, write_code=True)
+knl = lp.fix_parameters(knl, n=n)
 knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
 knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
 knl = lp.tag_array_axes(knl, "a,out", "C,vec")
 
-knl(queue, a=a.reshape(-1, 4), n=n)
+knl(queue, a=a.reshape(-1, 4))
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..43d5a6112fae8b38effb3dd7a0f12fa4e51058ee 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -24,7 +24,7 @@ THE SOFTWARE.
 
 import six
 
-from loopy.diagnostic import LoopyError, warn
+from loopy.diagnostic import LoopyError, warn_with_kernel
 from pytools import ImmutableRecord
 import islpy as isl
 
@@ -178,6 +178,11 @@ class CodeGenerationState(object):
 
         None or an instance of :class:`VectorizationInfo`
 
+    .. attribute:: insn_was_not_vectorizable
+
+        If true, we have a call to :func:`try_vectorized` has failed, and we are
+        in the :func:`unvectorize` fallback
+
     .. attribute:: is_generating_device_code
 
     .. attribute:: gen_program_name
@@ -196,7 +201,8 @@ class CodeGenerationState(object):
             vectorization_info=None, var_name_generator=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None):
+            schedule_index_end=None,
+            insn_was_not_vectorizable=False):
         self.kernel = kernel
         self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
@@ -211,6 +217,7 @@ class CodeGenerationState(object):
         self.is_generating_device_code = is_generating_device_code
         self.gen_program_name = gen_program_name
         self.schedule_index_end = schedule_index_end
+        self.insn_was_not_vectorizable = insn_was_not_vectorizable
 
     # {{{ copy helpers
 
@@ -219,7 +226,8 @@ class CodeGenerationState(object):
             var_subst_map=None, vectorization_info=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None):
+            schedule_index_end=None,
+            removed_predicates=frozenset()):
 
         if kernel is None:
             kernel = self.kernel
@@ -227,12 +235,14 @@ class CodeGenerationState(object):
         if implemented_data_info is None:
             implemented_data_info = self.implemented_data_info
 
+        if vectorization_info is None:
+            vectorization_info = self.vectorization_info
+
+        insn_was_not_vectorizable = self.insn_was_not_vectorizable
         if vectorization_info is False:
+            insn_was_not_vectorizable = True
             vectorization_info = None
 
-        elif vectorization_info is None:
-            vectorization_info = self.vectorization_info
-
         if is_generating_device_code is None:
             is_generating_device_code = self.is_generating_device_code
 
@@ -247,7 +257,8 @@ class CodeGenerationState(object):
                 implemented_data_info=implemented_data_info,
                 implemented_domain=implemented_domain or self.implemented_domain,
                 implemented_predicates=(
-                    implemented_predicates or self.implemented_predicates),
+                    (implemented_predicates or self.implemented_predicates) -
+                    removed_predicates),
                 seen_dtypes=self.seen_dtypes,
                 seen_functions=self.seen_functions,
                 seen_atomic_dtypes=self.seen_atomic_dtypes,
@@ -257,7 +268,8 @@ class CodeGenerationState(object):
                 var_name_generator=self.var_name_generator,
                 is_generating_device_code=is_generating_device_code,
                 gen_program_name=gen_program_name,
-                schedule_index_end=schedule_index_end)
+                schedule_index_end=schedule_index_end,
+                insn_was_not_vectorizable=insn_was_not_vectorizable)
 
     def copy_and_assign(self, name, value):
         """Make a copy of self with variable *name* fixed to *value*."""
@@ -306,7 +318,7 @@ class CodeGenerationState(object):
         return self.copy_and_assign(iname, expr).copy(
                 implemented_domain=new_impl_domain)
 
-    def try_vectorized(self, what, func):
+    def try_vectorized(self, what, func, vector_kwargs={}):
         """If *self* is in a vectorizing state (:attr:`vectorization_info` is
         not None), tries to call func (which must be a callable accepting a
         single :class:`CodeGenerationState` argument). If this fails with
@@ -321,9 +333,9 @@ class CodeGenerationState(object):
             return func(self)
 
         try:
-            return func(self)
+            return func(self, **vector_kwargs)
         except Unvectorizable as e:
-            warn(self.kernel, "vectorize_failed",
+            warn_with_kernel(self.kernel, "vectorize_failed",
                     "Vectorization of '%s' failed because '%s'"
                     % (what, e))
 
@@ -332,7 +344,11 @@ class CodeGenerationState(object):
     def unvectorize(self, func):
         vinf = self.vectorization_info
         result = []
-        novec_self = self.copy(vectorization_info=False)
+        novec_self = self.copy(
+            vectorization_info=False,
+            # we must clear the implemented predicates, as they may have been
+            # generated as vector conditionals, and no longer be valide
+            removed_predicates=self.implemented_predicates)
 
         for i in range(vinf.length):
             idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index e1520a82ed69fa2aed729d9b1d849a78d658c4e1..31d0cbbf72a1373404231062b5dd6e8520577b87 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -510,11 +510,12 @@ def build_loop_nest(codegen_state, schedule_index):
                                 pred_chk for pred_chk in pred_checks]
 
                     prev_result = prev_gen_code(inner_codegen_state)
-
-                    return [wrap_in_if(
-                        inner_codegen_state,
-                        condition_exprs,
-                        merge_codegen_results(codegen_state, prev_result))]
+                    inner = merge_codegen_results(codegen_state, prev_result)
+                    return [new_codegen_state.try_vectorized(
+                        inner.current_ast(inner_codegen_state),
+                        lambda ics, **kwargs: wrap_in_if(
+                            ics, condition_exprs, inner, **kwargs),
+                        vector_kwargs={'is_vectorized': True})]
 
                 cannot_vectorize = False
                 if new_codegen_state.vectorization_info is not None:
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index e590502fb5813af0a820d45228de8e11c35a46c8..e370eef674d48adc7c410ce556c479c10ca8a8e9 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -64,7 +64,7 @@ def to_codegen_result(
         from pymbolic.primitives import LogicalAnd
         from pymbolic.mapper.stringifier import PREC_NONE
         ast = codegen_state.ast_builder.emit_if(
-                codegen_state.expression_to_code_mapper(
+                lambda: codegen_state.expression_to_code_mapper(
                     LogicalAnd(tuple(condition_exprs)), PREC_NONE),
                 ast)
 
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 4318ad71c1b16deeaac98f8408d5ca82f2de1714..eb8586f0dd80b839a3d346f181214c9c05579ddd 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -88,6 +88,11 @@ class CodeGenerationResult(ImmutableRecord):
 
         a list of :class:`loopy.codegen.ImplementedDataInfo` objects.
         Only added at the very end of code generation.
+
+    .. attribute:: vectorize_failed
+
+        If True, the currently generated instructions are in the unrolled failed
+        vectorization state (i.e., 'unvectorize')
     """
 
     @staticmethod
@@ -100,12 +105,12 @@ class CodeGenerationResult(ImmutableRecord):
         if codegen_state.is_generating_device_code:
             kwargs = {
                     "host_program": None,
-                    "device_programs": [prg],
+                    "device_programs": [prg]
                     }
         else:
             kwargs = {
                     "host_program": prg,
-                    "device_programs": [],
+                    "device_programs": []
                     }
 
         return CodeGenerationResult(
@@ -254,17 +259,126 @@ def merge_codegen_results(codegen_state, elements, collapse=True):
                 **kwargs))
 
 
-def wrap_in_if(codegen_state, condition_exprs, inner):
+def wrap_in_if(codegen_state, condition_exprs, inner, is_vectorized=False):
+    """
+    :param:`is_vectorized` indicates whether the generated AST was successfully
+        vectorized, or whether it was fed through unvectorize
+    """
     if condition_exprs:
         from pymbolic.primitives import LogicalAnd
         from pymbolic.mapper.stringifier import PREC_NONE
         cur_ast = inner.current_ast(codegen_state)
+        method = codegen_state.ast_builder.emit_if
+
+        def condition_mapper(ast=None, type_context=None, needed_dtype=None,
+                             condition=None):
+            if condition is not None:
+                # explicit vectorization override
+                pass
+            else:
+                condition = LogicalAnd(tuple(condition_exprs))
+            return codegen_state.expression_to_code_mapper(
+                    condition, PREC_NONE,
+                    type_context=type_context, needed_dtype=needed_dtype)
+        mapper = condition_mapper
+
+        if codegen_state.vectorization_info is not None and is_vectorized:
+            from loopy.symbolic import get_dependencies
+            from loopy.kernel.array import VectorArrayDimTag
+            from loopy.kernel.data import ValueArg
+
+            vec_iname = codegen_state.vectorization_info.iname
+
+            # precalculate vector arrays / temporaries
+            knl = codegen_state.kernel
+            vec_arys = set([x.name for x in knl.args + list(
+                knl.temporary_variables.values())
+                    if not isinstance(x, ValueArg) and any(
+                    isinstance(dt, VectorArrayDimTag)
+                    for dt in x.dim_tags)])
+
+            def check_vec_dep(condition):
+                deps = get_dependencies(condition)
+                # check conditions for explicit vector iname dependecies
+                if len(deps & set([vec_iname])):
+                    return True
+                # check for vector temporaries / arrays in conditional
+                if len(deps & vec_arys):
+                    return True
+
+            if any(check_vec_dep(cond) for cond in condition_exprs):
+                # condition directly involves a vector array or iname
+
+                def condition_mapper_wrapper(ast=None):
+                    if ast is None:
+                        # default case for printing
+                        return condition_mapper()
+
+                    # get the default condition to check for vectorizability
+                    check = condition_mapper()
+
+                    # get LHS dtype for (potential) casting of condition
+                    from loopy.expression import dtype_to_type_context
+                    lhs_dtype = codegen_state.expression_to_code_mapper.infer_type(
+                        ast.lvalue.expr)
+                    if not lhs_dtype.is_integral():
+                        # in OpenCL, the dtype of the conditional in a select call
+                        # must be an integer of the same 'bitness' as the dtype of
+                        # the conditional (https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/select.html)  # noqa
+                        # (e.g., float64 -> int64)
+                        from loopy.types import to_loopy_type
+                        import numpy as np
+                        lhs_dtype = to_loopy_type(
+                            np.dtype('i%d' % lhs_dtype.itemsize),
+                            lhs_dtype.target)
+                    type_context = dtype_to_type_context(codegen_state.kernel.target,
+                        lhs_dtype)
+
+                    from loopy.symbolic import VectorTypeCast
+                    from loopy.types import to_loopy_type
+                    from pymbolic.primitives import Variable
+                    from pymbolic.mapper.substitutor import substitute
+                    import numpy as np
+                    kwargs = {}
+                    deps = set()
+                    try:
+                        for c in check.expr.children:
+                            deps |= get_dependencies(c)
+
+                        if deps & set([vec_iname]):
+                            # we have to insert our own temporary version of the
+                            # vector iname here
+                            # get the vector size
+                            size = codegen_state.vectorization_info.length
+                            # determine the dtype
+                            np_dtype = np.dtype('i%d' % lhs_dtype.itemsize)
+                            dtype = codegen_state.kernel.target.\
+                                get_dtype_registry().dtype_to_ctype(
+                                    to_loopy_type(np_dtype,
+                                    target=codegen_state.kernel.target))
+                            # get the string form
+                            name = '%s%d' % (dtype, size)
+                            # next, get the base of a vector temporary
+                            init = range(size)
+                            # finally, put in a vextor typecast
+                            temp_iname = VectorTypeCast(np_dtype, init, name)
+                            kwargs['condition'] = substitute(
+                                check.expr, {Variable(vec_iname): temp_iname})
+
+                    except (AttributeError, TypeError):
+                        pass
+
+                    return condition_mapper(
+                        type_context=type_context, needed_dtype=lhs_dtype,
+                        **kwargs)
+
+                # mark as vector predicates
+                method = codegen_state.ast_builder.emit_vector_if
+                mapper = condition_mapper_wrapper
+
         return inner.with_new_ast(
                 codegen_state,
-                codegen_state.ast_builder.emit_if(
-                    codegen_state.expression_to_code_mapper(
-                        LogicalAnd(tuple(condition_exprs)), PREC_NONE),
-                    cur_ast))
+                method(mapper, cur_ast))
 
     return inner
 
diff --git a/loopy/expression.py b/loopy/expression.py
index 3269bc09f064f57857eaa5218c8370383e0f735e..06fe3bb06b1071bb6e8a1a80b861197d9eb0ed5f 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -63,6 +63,30 @@ class VectorizabilityChecker(RecursiveMapper):
     .. attribute:: vec_iname
     """
 
+    # this is a simple list of math functions from OpenCL-1.2
+    # https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/mathFunctions.html
+    # this could be expanded / moved to it's own target specific VecCheck if
+    # necessary
+    functions = """acos    acosh   acospi  asin
+    asinh   asinpi  atan    atan2
+    atanh   atanpi  atan2pi cbrt
+    ceil    copysign    cos cosh
+    cospi   erfc    erf exp
+    exp2    exp10   expm1   fabs
+    fdim    floor   fma fmax
+    fmin    fmod    fract   frexp
+    hypot   ilogb   ldexp   lgamma
+    lgamma_r    log log2    log10
+    log1p   logb    mad maxmag
+    minmag  modf    nan nextafter
+    pow pown    powr    remainder
+    remquo  rint    rootn   round
+    rsqrt   sin sincos  sinh
+    sinpi   sqrt    tan tanh
+    tanpi   tgamma  trunc"""
+
+    functions = [x.strip() for x in functions.split() if x.strip()]
+
     def __init__(self, kernel, vec_iname, vec_iname_length):
         self.kernel = kernel
         self.vec_iname = vec_iname
@@ -75,7 +99,7 @@ class VectorizabilityChecker(RecursiveMapper):
         return reduce(and_, vectorizabilities)
 
     def map_sum(self, expr):
-        return any(self.rec(child) for child in expr.children)
+        return any([self.rec(child) for child in expr.children])
 
     map_product = map_sum
 
@@ -84,6 +108,16 @@ class VectorizabilityChecker(RecursiveMapper):
                 or
                 self.rec(expr.denominator))
 
+    map_remainder = map_quotient
+
+    def map_floor_div(self, expr):
+        """
+        (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)
+        """
+        a, b = expr.numerator, expr.denominator
+        return self.rec(a) and self.rec(a.lt(0)) and self.rec(b - 1) and \
+            self.rec((a - (b - 1)) / b) and self.rec(a / b)
+
     def map_linear_subscript(self, expr):
         return False
 
@@ -93,10 +127,54 @@ class VectorizabilityChecker(RecursiveMapper):
         rec_pars = [
                 self.rec(child) for child in expr.parameters]
         if any(rec_pars):
-            raise Unvectorizable("fucntion calls cannot yet be vectorized")
+            if str(expr.function) not in VectorizabilityChecker.functions:
+                return Unvectorizable(
+                    'Function {} is not known to be vectorizable'.format(
+                        str(expr.function)))
+            return True
 
         return False
 
+    @staticmethod
+    def compile_time_constants(kernel, vec_iname):
+        """
+        Returns a dictionary of (non-vector) inames and temporary variables whose
+        value is known at "compile" time. These are used (in combination with a
+        codegen state's variable substitution map) to simplifying access expressions
+        in :func:`get_access_info`.
+
+        Note: inames are mapped to the :class:`Variable` version of themselves,
+              while temporary variables are mapped to their integer value
+
+        .. parameter:: kernel
+            The kernel to check
+        .. parameter:: vec_iname
+            the vector iname
+
+        """
+
+        # determine allowed symbols as non-vector inames
+        from pymbolic.primitives import Variable
+        allowed_symbols = dict((sym, Variable(sym)) for sym in kernel.all_inames()
+                               if sym != vec_iname)
+        from loopy.kernel.instruction import Assignment
+        from loopy.tools import is_integer
+        from six import iteritems
+
+        # and compile time integer temporaries
+        compile_time_assign = dict((str(insn.assignee), insn.expression)
+                                   for insn in kernel.instructions if
+                                   isinstance(insn, Assignment) and is_integer(
+                                   insn.expression))
+        allowed_symbols.update(
+            dict((sym, compile_time_assign[sym]) for sym, var in iteritems(
+                    kernel.temporary_variables)
+                # temporary variables w/ no initializer, no shape
+                if var.initializer is None and not var.shape
+                # compile time integers
+                and sym in compile_time_assign))
+        return allowed_symbols
+
     def map_subscript(self, expr):
         name = expr.aggregate.name
 
@@ -114,29 +192,45 @@ class VectorizabilityChecker(RecursiveMapper):
 
         index = expr.index_tuple
 
-        from loopy.symbolic import get_dependencies
+        from loopy.symbolic import get_dependencies, DependencyMapper
         from loopy.kernel.array import VectorArrayDimTag
-        from pymbolic.primitives import Variable
 
         possible = None
         for i in range(len(var.shape)):
-            if (
-                    isinstance(var.dim_tags[i], VectorArrayDimTag)
-                    and isinstance(index[i], Variable)
-                    and index[i].name == self.vec_iname):
+            dep_mapper = DependencyMapper(composite_leaves=False)
+            deps = dep_mapper(index[i])
+            # if we're on the vector index
+            if isinstance(var.dim_tags[i], VectorArrayDimTag):
                 if var.shape[i] != self.vec_iname_length:
                     raise Unvectorizable("vector length was mismatched")
-
                 if possible is None:
-                    possible = True
-
-            else:
-                if self.vec_iname in get_dependencies(index[i]):
-                    raise Unvectorizable("vectorizing iname '%s' occurs in "
-                            "unvectorized subscript axis %d (1-based) of "
-                            "expression '%s'"
-                            % (self.vec_iname, i+1, expr))
-                    break
+                    possible = self.vec_iname in [str(x) for x in deps]
+            # or, if not vector index, and vector iname is present
+            elif self.vec_iname in set(x.name for x in deps):
+                # check whether we can simplify out the vector iname
+                context = dict((str(x), x) for x in deps if x.name != self.vec_iname)
+                allowed_symbols = self.compile_time_constants(
+                    self.kernel, self.vec_iname)
+
+                from pymbolic import substitute
+                from pymbolic.mapper.evaluator import UnknownVariableError
+                from loopy.tools import is_integer
+                for veci in range(self.vec_iname_length):
+                    ncontext = context.copy()
+                    ncontext[self.vec_iname] = veci
+                    try:
+                        idi = substitute(index[i], ncontext)
+                        if not is_integer(idi) and not all(
+                                x in allowed_symbols
+                                for x in get_dependencies(idi)):
+                            raise Unvectorizable(
+                                "vectorizing iname '%s' occurs in "
+                                "unvectorized subscript axis %d (1-based) of "
+                                "expression '%s', and could not be simplified"
+                                "to compile-time constants."
+                                % (self.vec_iname, i+1, expr))
+                    except UnknownVariableError:
+                        break
 
         return bool(possible)
 
@@ -160,16 +254,31 @@ class VectorizabilityChecker(RecursiveMapper):
         return False
 
     def map_comparison(self, expr):
-        # FIXME: These actually can be vectorized:
         # https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/relationalFunctions.html
 
+        # even better for OpenCL <, <=, >, >=, !=, == are all vectorizable by default
+        # (see: sec 6.3.d-6.d.3 in OpenCL-1.2 docs)
+
+        if expr.operator in ["<", "<=", ">", ">=", "!=", "=="]:
+            return any([self.rec(x) for x in [expr.left, expr.right]])
+
         raise Unvectorizable()
 
     def map_logical_not(self, expr):
-        raise Unvectorizable()
+        # 6.3.h in OpenCL-1.2 docs
+        return self.rec(expr.child)
+
+    def map_logical_and(self, expr):
+        # 6.3.h in OpenCL-1.2 docs
+        return any(self.rec(x) for x in expr.children)
+
+    map_logical_or = map_logical_and
 
-    map_logical_and = map_logical_not
-    map_logical_or = map_logical_not
+    # sec 6.3.f in OpenCL-1.2 docs
+    map_bitwise_not = map_logical_not
+    map_bitwise_or = map_logical_and
+    map_bitwise_xor = map_logical_and
+    map_bitwise_and = map_logical_and
 
     def map_reduction(self, expr):
         # FIXME: Do this more carefully
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 3588f38af13479b127208c25735f1046eaa82706..f7025f6eebb9fd772574e43268a77ede3436fa67 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1213,33 +1213,58 @@ class AccessInfo(ImmutableRecord):
     """
 
 
-def get_access_info(target, ary, index, eval_expr, vectorization_info):
+def get_access_info(target, ary, index, var_subst_map, vectorization_info):
     """
     :arg ary: an object of type :class:`ArrayBase`
     :arg index: a tuple of indices representing a subscript into ary
+    :arg var_subst_map: a context of variable substitutions from the calling codegen
+        state and potentially other compile-time "constants" (inames and
+        integer temporaries w/ known values), used in detection of loads / shuffles
     :arg vectorization_info: an instance of :class:`loopy.codegen.VectorizationInfo`,
         or *None*.
     """
 
     import loopy as lp
     from pymbolic import var
+    from loopy.codegen import Unvectorizable
+    from loopy.symbolic import get_dependencies
 
-    def eval_expr_assert_integer_constant(i, expr):
+    def eval_expr_assert_constant(i, expr, kwargs):
         from pymbolic.mapper.evaluator import UnknownVariableError
+        # determine error type -- if vectorization_info is None, we're in the
+        # unvec fallback (and should raise a LoopyError)
+        # if vectorization_info is 'True', we should raise an Unvectorizable
+        # on failure
+        error_type = LoopyError if vectorization_info is None else Unvectorizable
+        from pymbolic import evaluate
         try:
-            result = eval_expr(expr)
+            result = evaluate(expr, kwargs)
         except UnknownVariableError as e:
-            raise LoopyError("When trying to index the array '%s' along axis "
+            if vectorization_info:
+                # failed vectorization
+                raise Unvectorizable(
+                    "When trying to vectorize the array '%s' along axis "
                     "%d (tagged '%s'), the index was not a compile-time "
                     "constant (but it has to be in order for code to be "
-                    "generated). You likely want to unroll the iname(s) '%s'."
+                    "generated). You likely want to unroll the iname(s) '%s'"
                     % (ary.name, i, ary.dim_tags[i], str(e)))
+            else:
+                raise LoopyError(
+                    "When trying to unroll the array '%s' along axis "
+                    "%d (tagged '%s'), the index was not an unrollable-iname "
+                    "or constant (but it has to be in order for code to be "
+                    "generated). You likely want to unroll/change array index(s)"
+                    " '%s'" % (ary.name, i, ary.dim_tags[i], str(e)))
 
         if not is_integer(result):
-            raise LoopyError("subscript '%s[%s]' has non-constant "
+            # try to simplify further
+            from loopy.isl_helpers import simplify_via_aff
+            result = simplify_via_aff(result)
+
+        if any([x not in var_subst_map for x in get_dependencies(result)]):
+            raise error_type("subscript '%s[%s]' has non-constant "
                     "index for separate-array axis %d (0-based)" % (
                         ary.name, index, i))
-
         return result
 
     def apply_offset(sub):
@@ -1290,11 +1315,50 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
         if isinstance(dim_tag, SeparateArrayArrayDimTag):
-            idx = eval_expr_assert_integer_constant(i, idx)
+            idx = eval_expr_assert_constant(i, idx, var_subst_map)
             array_name += "_s%d" % idx
 
     # }}}
 
+    def __get_simplified(arr):
+        from loopy.isl_helpers import simplify_via_aff
+        return [simplify_via_aff(arr[i]) for i in range(len(arr))]
+
+    def is_contiguous(arr):
+        from loopy.isl_helpers import simplify_via_aff
+        from functools import cmp_to_key
+        if not len(arr) or len(arr) != vector_size:
+            return False
+        try:
+            sarr = sorted(arr)
+        except TypeError:
+            # tried to sort a pymbolic expression, try w/ comparison sort
+            sarr = sorted(arr, key=cmp_to_key(
+                lambda x, y: simplify_via_aff(x - y) > 0))
+        return simplify_via_aff(sarr[-1] - sarr[0] + 1) == vector_size
+
+    def is_monotonic(arr):
+        if not len(arr):
+            return False
+        signs = __get_simplified(
+            [arr[i + 1] - arr[i] for i in range(len(arr) - 1)])
+        # check if array is monotonic increasing / decreasing
+        signs = [x < 0 for x in signs]
+        return all(s == signs[0] for s in signs[1:])
+
+    def run_over_vecrange(i, idx, base_subs):
+        evaled = []
+        for veci in range(vector_size):
+            try:
+                subsi = base_subs.copy()
+                subsi[vectorization_info.iname] = veci
+                evaled.append(eval_expr_assert_constant(i, idx, subsi))
+            except Unvectorizable:
+                pass
+        return __get_simplified(evaled)
+
+    vec_op_type = None
+
     # {{{ process remaining dim tags
 
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
@@ -1311,6 +1375,27 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
             elif stride is lp.auto:
                 stride = var(array_name + "_stride%d" % i)
 
+            if vectorization_info and \
+                    vectorization_info.iname in get_dependencies(idx):
+                # need to determine here whether the vector iname is aligned with
+                # the vector size -> shuffle, or unaligned -> load
+                evaled = run_over_vecrange(i, idx, var_subst_map)
+                if is_monotonic(evaled):
+                    vec_op_type = 'shuffle' if all(x == evaled[0] for x in evaled) \
+                        else 'load'
+                else:
+                    raise Unvectorizable('Vectorized iname %s present in '
+                        'unvectorized axis %s (1-based) access "%s", and not '
+                        'simplifiable to compile-time contigous access' % (
+                            vectorization_info.iname, i + 1, idx))
+            elif vectorization_info:
+                vec_op_type = 'shuffle'  # independent of vector iname
+
+            # update vector operation type if necessary
+            if vector_index is not None and isinstance(vector_index, tuple):
+                assert vector_index[0] is None  # pylint: disable=E1136
+                vector_index = (vec_op_type, vector_index[1])  # pylint: disable=E1136; # noqa
+
             subscripts[dim_tag.target_axis] += (stride // vector_size)*idx
 
         elif isinstance(dim_tag, SeparateArrayArrayDimTag):
@@ -1318,18 +1403,29 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
         elif isinstance(dim_tag, VectorArrayDimTag):
             from pymbolic.primitives import Variable
-            if (vectorization_info is not None
-                    and isinstance(index[i], Variable)
+            if (vectorization_info and isinstance(index[i], Variable)
                     and index[i].name == vectorization_info.iname):
                 # We'll do absolutely nothing here, which will result
                 # in the vector being returned.
                 pass
 
             else:
-                idx = eval_expr_assert_integer_constant(i, idx)
-
-                assert vector_index is None
-                vector_index = idx
+                if vectorization_info:
+                    # check dependencies
+                    deps = get_dependencies(idx) - set(var_subst_map.keys())
+                    if len(deps) == 1 and vectorization_info.iname in deps:
+                        # we depend only on the vectorized iname -- see if we can
+                        # simplify to a load / shuffle
+                        evaled = run_over_vecrange(i, idx, var_subst_map)
+                        if is_contiguous(evaled):
+                            # we can generate a load or shuffle depending on the
+                            # alignment
+                            vector_index = (vec_op_type, evaled)
+
+                if vector_index is None:
+                    # if we haven't generated a load of shuffle...
+                    idx = eval_expr_assert_constant(i, idx, var_subst_map)
+                    vector_index = idx
 
         else:
             raise LoopyError("unsupported array dim implementation tag '%s' "
@@ -1345,6 +1441,9 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
         subscripts[0] = apply_offset(subscripts[0])
 
+    if isinstance(vector_index, tuple):
+        assert vector_index[0] is not None, 'Unknown vectorization type'
+
     return AccessInfo(
             array_name=array_name,
             vector_index=vector_index,
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index dd68c950e04e29fc99c456412d9dc4a53dbc61b2..d3da73119ad11fa7be56169d3c4eab14aa3ae903 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -502,6 +502,7 @@ class TemporaryVariable(ArrayBase):
         the temporary as a ``restrict`` const pointer to the base storage
         memory location. If *True*, the restrict part is omitted on this
         declaration.
+
     """
 
     min_target_axes = 0
@@ -514,7 +515,7 @@ class TemporaryVariable(ArrayBase):
             "base_storage",
             "initializer",
             "read_only",
-            "_base_storage_access_may_be_aliasing",
+            "_base_storage_access_may_be_aliasing"
             ]
 
     def __init__(self, name, dtype=None, shape=(), address_space=None,
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index f5cf07b0e1d62212ce36edb48f47eb7de7d31451..ccbdbaf49d7d84fdb4ee056ae57a50eb0c5aa26c 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -75,6 +75,8 @@ class IdentityMapperMixin(object):
     def map_array_literal(self, expr, *args):
         return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children))
 
+    map_vector_literal = map_array_literal
+
     def map_group_hw_index(self, expr, *args):
         return expr
 
@@ -106,9 +108,11 @@ class IdentityMapperMixin(object):
         return expr
 
     def map_type_annotation(self, expr, *args):
-        return type(expr)(expr.type, self.rec(expr.child))
+        kwargs = {}
+        return type(expr)(expr.type, self.rec(expr.child), **kwargs)
 
     map_type_cast = map_type_annotation
+    map_vector_type_cast = map_type_annotation
 
     map_linear_subscript = IdentityMapperBase.map_subscript
 
@@ -139,6 +143,8 @@ class WalkMapper(WalkMapperBase):
         for ch in expr.children:
             self.rec(ch, *args)
 
+    map_vector_literal = map_array_literal
+
     def map_group_hw_index(self, expr, *args):
         self.visit(expr)
 
@@ -156,6 +162,7 @@ class WalkMapper(WalkMapperBase):
             return
         self.rec(expr.child, *args)
 
+    map_vector_type_cast = map_type_cast
     map_tagged_variable = WalkMapperBase.map_variable
 
     def map_loopy_function_identifier(self, expr, *args):
@@ -195,6 +202,10 @@ class StringifyMapper(StringifyMapperBase):
     def map_array_literal(self, expr, *args):
         return "{%s}" % ", ".join(self.rec(ch) for ch in expr.children)
 
+    def map_vector_literal(self, expr, *args):
+        from pymbolic.mapper.stringifier import PREC_NONE
+        return "(%s)" % ", ".join(self.rec(ch, PREC_NONE) for ch in expr.children)
+
     def map_group_hw_index(self, expr, enclosing_prec):
         return "grp.%d" % expr.index
 
@@ -232,6 +243,11 @@ class StringifyMapper(StringifyMapperBase):
         from pymbolic.mapper.stringifier import PREC_NONE
         return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE))
 
+    def map_vector_type_cast(self, expr, enclosing_prec):
+        from pymbolic.mapper.stringifier import PREC_NONE
+        return "cast(%s, %s)" % (repr(expr.type_name), self.rec(
+            expr.child, PREC_NONE))
+
 
 class UnidirectionalUnifier(UnidirectionalUnifierBase):
     def map_reduction(self, expr, other, unis):
@@ -289,6 +305,14 @@ class DependencyMapper(DependencyMapperBase):
     def map_type_cast(self, expr):
         return self.rec(expr.child)
 
+    map_vector_type_cast = map_type_cast
+
+    def map_literal(self, expr):
+        return set()
+
+    def map_vector_literal(self, expr):
+        return self.combine(self.rec(child) for child in expr.children)
+
 
 class SubstitutionRuleExpander(IdentityMapper):
     def __init__(self, rules):
@@ -365,6 +389,25 @@ class ArrayLiteral(p.Leaf):
     mapper_method = "map_array_literal"
 
 
+class VectorLiteral(p.Leaf):
+    """An vector dtype literal."""
+
+    # Currently only in conjunction with the VectorTypeCast
+
+    def __init__(self, children):
+        self.children = children
+
+    def stringifier(self):
+        return StringifyMapper
+
+    def __getinitargs__(self):
+        return (self.children,)
+
+    init_arg_names = ("children",)
+
+    mapper_method = "map_vector_literal"
+
+
 class HardwareAxisIndex(p.Leaf):
     def __init__(self, axis):
         self.axis = axis
@@ -475,6 +518,62 @@ class TypeCast(p.Expression):
     mapper_method = intern("map_type_cast")
 
 
+class VectorTypeCast(p.Expression):
+    """
+    A workaround for casts of vector temporaries, e.g.:
+        (int4)(0, 1, 2, 3)
+
+    Useful for inserting temporaries into expressions to avoid unvectorizable code
+
+    .. attribute:: type
+
+        The (non-vector) numpy type to cast to. e.g., if using 'int4', the type
+        would be np.int32
+
+    .. attribute:: child
+
+        The :class:`VectorLiteral` initializer list to convert to via typecast
+
+    .. attribute:: type_name
+
+        The stringified type (including vector size), e.g., 'int4'
+    """
+
+    def __init__(self, type, init, type_name):
+        super(VectorTypeCast, self).__init__()
+
+        from loopy.types import to_loopy_type, NumpyType
+        type = to_loopy_type(type)
+
+        if (not isinstance(type, NumpyType)
+                or not issubclass(type.dtype.type, np.number)):
+            from loopy.diagnostic import LoopyError
+            raise LoopyError("TypeCast only supports numerical numpy types, "
+                    "not '%s'" % type)
+
+        # We're storing the type as a name for now to avoid
+        # numpy pickling bug madness. (see loopy.types)
+        self.type_name = type_name
+        self.child = VectorLiteral(tuple(s for s in init))
+        self._base_type = type.dtype
+
+    @property
+    def type(self):
+        from loopy.types import NumpyType
+        return NumpyType(self._base_type)
+
+    # init_arg_names is a misnomer--they're attribute names used for pickling.
+    init_arg_names = ("type_name", "child", "_base_type")
+
+    def __getinitargs__(self):
+        return (self.type_name, self.child, self._base_type)
+
+    def stringifier(self):
+        return StringifyMapper
+
+    mapper_method = intern("map_vector_type_cast")
+
+
 class TaggedVariable(p.Variable):
     """This is an identifier with a tag, such as 'matrix$one', where
     'one' identifies this specific use of the identifier. This mechanism
@@ -1740,6 +1839,8 @@ class BatchedAccessRangeMapper(WalkMapper):
     def map_type_cast(self, expr, inames):
         return self.rec(expr.child, inames)
 
+    map_vector_type_cast = map_type_cast
+
 
 class AccessRangeMapper(object):
     """**IMPORTANT**
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index a81354e2fd7b52ba514af936441c7a2d980c77b5..879f9f1f34092add33dca026d0cba39ab299e090 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -220,7 +220,10 @@ class ASTBuilderBase(object):
     def can_implement_conditionals(self):
         return False
 
-    def emit_if(self, condition_str, ast):
+    def emit_if(self, condition_mapper, ast):
+        raise NotImplementedError()
+
+    def emit_vector_if(self, condition_mapper, ast):
         raise NotImplementedError()
 
     def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 17dd9dc1034f4572e2bcf1d3abc806354c73336e..38a44d5727d4af321bc50a27fc9c56031f082cbe 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -967,9 +967,9 @@ class CASTBuilder(ASTBuilderBase):
     def can_implement_conditionals(self):
         return True
 
-    def emit_if(self, condition_str, ast):
+    def emit_if(self, condition_mapper, ast):
         from cgen import If
-        return If(condition_str, ast)
+        return If(condition_mapper(), ast)
 
     # }}}
 
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 8ef921e447bf10d85ac60460f904d528ac64da19..934b9b78db1b3fd6421645c4d1ca6a5d8c250249 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -183,15 +183,22 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         ary = self.find_array(expr)
 
         from loopy.kernel.array import get_access_info
-        from pymbolic import evaluate
 
         from loopy.symbolic import simplify_using_aff
         index_tuple = tuple(
                 simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
 
+        from loopy.expression import VectorizabilityChecker
+        var_subst_map = self.codegen_state.var_subst_map.copy()
+        if self.codegen_state.vectorization_info:
+            ctc_iname = self.codegen_state.vectorization_info.iname
+            ctc = VectorizabilityChecker.compile_time_constants(
+                    self.codegen_state.kernel,
+                    ctc_iname)
+            var_subst_map.update(ctc)
+
         access_info = get_access_info(self.kernel.target, ary, index_tuple,
-                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
-                self.codegen_state.vectorization_info)
+                var_subst_map, self.codegen_state.vectorization_info)
 
         from loopy.kernel.data import (
                 ImageArg, ArrayArg, TemporaryVariable, ConstantArg)
@@ -252,8 +259,34 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                             self.kernel, self.rec(subscript, 'i')))
 
             if access_info.vector_index is not None:
-                return self.codegen_state.ast_builder.add_vector_access(
-                    result, access_info.vector_index)
+                if isinstance(access_info.vector_index, tuple):
+                    # check for specific vector access nodes
+                    try:
+                        method, ind = access_info.vector_index
+                        method = getattr(self.codegen_state.ast_builder,
+                                          'add_vector_%s' % method)
+                        return method(result,
+                                      self.codegen_state.vectorization_info.iname,
+                                      ary, ind)
+                    except AttributeError:
+                        from loopy.codegen import Unvectorizable
+                        raise Unvectorizable('Target %s has no map node for '
+                            'method add_vector_%s' % (
+                                str(type(self.codegen_state.ast_builder)),
+                                method))
+
+                try:
+                    from loopy.tools import is_integer
+                    assert is_integer(access_info.vector_index)
+                    return self.codegen_state.ast_builder.add_vector_access(
+                        result, access_info.vector_index)
+                except AssertionError:
+                    from loopy.codegen import Unvectorizable
+                    raise Unvectorizable(
+                        "Cannot add vector access for non-integer vector addressing "
+                        "did you mean to tag iname '%s' as a vector index?" % (
+                            access_info.vector_index))
+
             else:
                 return result
 
@@ -358,6 +391,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         cast = var("(%s)" % registry.dtype_to_ctype(expr.type))
         return cast(self.rec(expr.child, type_context))
 
+    def map_vector_type_cast(self, expr, type_context):
+        cast = var("(%s)" % expr.type_name)
+        return cast(self.rec(expr.child, type_context))
+
     def map_constant(self, expr, type_context):
         if isinstance(expr, (complex, np.complexfloating)):
             try:
@@ -411,9 +448,15 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             ary = self.find_array(arg)
 
             from loopy.kernel.array import get_access_info
-            from pymbolic import evaluate
+            var_subst_map = self.codegen_state.var_subst_map.copy()
+            if self.codegen_state.vectorization_info:
+                from loopy.expression import VectorizabilityChecker
+                ctc = VectorizabilityChecker.compile_time_constants(
+                        self.codegen_state.kernel,
+                        self.codegen_state.vectorization_info.iname)
+                var_subst_map.update(ctc)
             access_info = get_access_info(self.kernel.target, ary, arg.index,
-                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
+                    self.codegen_state.var_subst_map.copy(),
                     self.codegen_state.vectorization_info)
 
             from loopy.kernel.data import ImageArg
@@ -907,6 +950,9 @@ class CExpressionToCodeMapper(RecursiveMapper):
     def map_array_literal(self, expr, enclosing_prec):
         return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE)
 
+    def map_vector_literal(self, expr, enclosing_prec):
+        return "( %s )" % self.join_rec(", ", expr.children, PREC_NONE)
+
 # }}}
 
 # vim: fdm=marker
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index cccee2301e44b16e2454bda5e98af7db7893c003..0e56d9fa69249fd801231102445c7970369cb55a 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -109,11 +109,17 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
             if lsize:
                 lsize, = lsize
                 from loopy.kernel.array import get_access_info
-                from pymbolic import evaluate
+
+                var_subst_map = self.codegen_state.var_subst_map.copy()
+                if self.codegen_state.vectorization_info:
+                    from loopy.expression import VectorizabilityChecker
+                    ctc = VectorizabilityChecker.compile_time_constants(
+                            self.codegen_state.kernel,
+                            self.codegen_state.vectorization_info.iname)
+                    var_subst_map.update(ctc)
 
                 access_info = get_access_info(self.kernel.target, ary, expr.index,
-                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
-                    self.codegen_state.vectorization_info)
+                    var_subst_map, self.codegen_state.vectorization_info)
 
                 subscript, = access_info.subscripts
                 result = var(access_info.array_name)[
@@ -397,15 +403,20 @@ class ISPCASTBuilder(CASTBuilder):
             ary = ecm.find_array(lhs)
 
             from loopy.kernel.array import get_access_info
-            from pymbolic import evaluate
-
             from loopy.symbolic import simplify_using_aff
             index_tuple = tuple(
                     simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)
 
+            var_subst_map = codegen_state.var_subst_map.copy()
+            if codegen_state.vectorization_info:
+                from loopy.expression import VectorizabilityChecker
+                ctc = VectorizabilityChecker.compile_time_constants(
+                        codegen_state.kernel,
+                        codegen_state.vectorization_info.iname)
+                var_subst_map.update(ctc)
+
             access_info = get_access_info(kernel.target, ary, index_tuple,
-                    lambda expr: evaluate(expr, codegen_state.var_subst_map),
-                    codegen_state.vectorization_info)
+                    var_subst_map, codegen_state.vectorization_info)
 
             from loopy.kernel.data import ArrayArg, TemporaryVariable
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 8a6e5284258d864d19d7f1353ec9dfaaa7d72a9b..f976dbd0cc973098ab96841ca6051da7e04c055f 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -34,6 +34,7 @@ from loopy.types import NumpyType
 from loopy.target.c import DTypeRegistryWrapper, c_math_mangler
 from loopy.kernel.data import AddressSpace, CallMangleInfo
 from pymbolic import var
+from pymbolic.primitives import Call
 
 from functools import partial
 
@@ -148,8 +149,9 @@ _CL_SIMPLE_MULTI_ARG_FUNCTIONS = {
         }
 
 
-VECTOR_LITERAL_FUNCS = dict(
-        ("make_%s%d" % (name, count), (name, dtype, count))
+def get_vector_func(func, template):
+    return dict(
+        (template % dict(func=func, name=name, count=count), (name, dtype, count))
         for name, dtype in [
             ('char', np.int8),
             ('uchar', np.uint8),
@@ -166,6 +168,11 @@ VECTOR_LITERAL_FUNCS = dict(
         )
 
 
+VECTOR_LITERAL_FUNCS = get_vector_func('make', '%(func)s_%(name)s%(count)d')
+VECTOR_STORE_FUNCS = get_vector_func('vstore', '%(func)s%(count)d')
+VECTOR_LOAD_FUNCS = get_vector_func('vload', '%(func)s%(count)d')
+
+
 def opencl_function_mangler(kernel, name, arg_dtypes):
     if not isinstance(name, str):
         return None
@@ -220,6 +227,27 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
                     NumpyType(dtype), count),),
                 arg_dtypes=(NumpyType(dtype),)*count)
 
+    if name in VECTOR_LOAD_FUNCS or name in VECTOR_STORE_FUNCS:
+        if name in VECTOR_LOAD_FUNCS:
+            load = True
+            _, dtype, count = VECTOR_LOAD_FUNCS[name]
+            result = (kernel.target.vector_dtype(NumpyType(dtype), count),)
+            args = (kernel.index_dtype, NumpyType(dtype))
+        else:
+            load = False
+            _, dtype, count = VECTOR_STORE_FUNCS[name]
+            result = tuple()
+            args = (kernel.target.vector_dtype(NumpyType(dtype), count),
+                    kernel.index_dtype, NumpyType(dtype))
+
+        if not ((count == 2 and load) or (count == 3 and not load)):
+            return None
+
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=result,
+                arg_dtypes=args)
+
     return None
 
 # }}}
@@ -299,6 +327,54 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
     def map_local_hw_index(self, expr, type_context):
         return var("lid")(expr.axis)
 
+    def map_vector_literal(self, expr, type_context):
+        return var(', '.join('%s' % x for x in expr.children))
+
+    def map_comparison(self, expr, type_context):
+        from loopy.symbolic import get_dependencies
+        from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type
+        from six import iteritems
+
+        vec_inames = set([iname for iname, tags in
+                          iteritems(self.kernel.iname_to_tags)
+                          if filter_iname_tags_by_type(tags, VectorizeTag)])
+
+        if get_dependencies(expr) & vec_inames and \
+               self.codegen_state.insn_was_not_vectorizable:
+            from loopy.diagnostic import warn_with_kernel
+            warn_with_kernel(self.codegen_state.kernel,
+                             'unrolled_vector_iname_conditional',
+                             'Unrolled vector-loop iname detected in vector ' +
+                             'comparison; this may in unexpected truth-values.')
+
+        return super(ExpressionToOpenCLCExpressionMapper, self).map_comparison(
+            expr, type_context)
+
+    def wrap_in_typecast(self, actual_type, needed_dtype, s):
+        wrap = super(ExpressionToOpenCLCExpressionMapper, self).wrap_in_typecast(
+            actual_type, needed_dtype, s)
+        if self.codegen_state.vectorization_info is not None and (
+                actual_type != needed_dtype):
+            from loopy.symbolic import get_dependencies
+            from loopy.kernel.array import VectorArrayDimTag
+            rhs_deps = get_dependencies(s)
+
+            def is_vector(var):
+                return any(isinstance(x, VectorArrayDimTag) for x in var.dim_tags)
+            # if we have a vector-type on the RHS and the RHS dtype != LHS dtype,
+            # we need an explicit conversion
+            rhs_temp_vars = rhs_deps & set(self.kernel.temporary_variables.keys())
+            rhs_args = rhs_deps & set(self.kernel.arg_dict.keys())
+            if any(is_vector(self.kernel.temporary_variables[x])
+                   for x in rhs_temp_vars) or any(
+                   is_vector(self.kernel.arg_dict[x]) for x in rhs_args):
+                ctype = self.kernel.target.get_dtype_registry().dtype_to_ctype(
+                    needed_dtype)
+                vw = self.codegen_state.vectorization_info.length
+                # need to add an explicit conversion
+                return var("convert_%s%d" % (ctype, vw))(wrap)
+        return wrap
+
 # }}}
 
 
@@ -359,6 +435,67 @@ class OpenCLTarget(CTarget):
 
 # }}}
 
+# {{{ simple opencl function wrappers
+
+
+class VectorFunc(Call):
+    def __init__(self, function, parameters):
+        # check that function and parameters are variables
+        from pymbolic.primitives import Variable, Expression
+        if not isinstance(function, Variable):
+            function = var(function)
+        parameters = list(parameters)
+        for i, param in enumerate(parameters):
+            if not isinstance(param, (Variable, Expression)):
+                parameters[i] = var(str(param))
+        super(VectorFunc, self).__init__(function, tuple(parameters))
+
+
+class VectorStore(VectorFunc):
+    def __init__(self, vector_width, store, offset, array):
+        """
+        Represents a vstoren
+
+        :arg vector_width: the SIMD vector-width
+        :arg store: the data to store
+        :arg offset: the offset in the array
+        :arg array: the array to store the data in
+        """
+
+        name = 'vstore%d' % vector_width
+        super(VectorStore, self).__init__(name, (store, offset, array))
+
+
+class VectorLoad(VectorFunc):
+    def __init__(self, vector_width, offset, array):
+        """
+        Represents a vloadn
+
+        :arg vector_width: the SIMD vector-width
+        :arg offset: the offset in the array
+        :arg array: the array to store the data in
+        """
+
+        name = 'vload%d' % vector_width
+        super(VectorLoad, self).__init__(name, (offset, array))
+
+
+class VectorSelect(VectorFunc):
+    def __init__(self, select_if_true, select_if_false, condition):
+        """
+        Represents a vector-select
+
+        :arg select_if_true: the value to be chosen if select_on is true
+        :arg select_if_false: the value to be chosen if select_on is false
+        :arg condition: the conditional selection value
+        """
+
+        name = 'select'
+        super(VectorSelect, self).__init__(name, (
+            select_if_false, select_if_true, condition))
+
+# }}}
+
 
 # {{{ ast builder
 
@@ -437,9 +574,109 @@ class OpenCLCASTBuilder(CASTBuilder):
     def get_expression_to_c_expression_mapper(self, codegen_state):
         return ExpressionToOpenCLCExpressionMapper(codegen_state)
 
+    def emit_assignment(self, codegen_state, insn):
+        """
+        A wrapper around the base C-target emit_assignment, to handle explicit-SIMD
+        functions, such as selects, vstore's and vload's and shuffles
+        """
+
+        assignment = super(OpenCLCASTBuilder, self).emit_assignment(
+            codegen_state, insn)
+
+        # fix-up
+        try:
+            if isinstance(assignment.lvalue.expr, VectorLoad):
+                from cgen import Statement
+                # get vector width
+                func = str(assignment.lvalue.expr.function)
+                vw = int(func[func.index('vload') + len('vload'):])
+                # convert to vector store
+                store = VectorStore(vw, assignment.rvalue.expr,
+                                    *assignment.lvalue.expr.parameters)
+                # and to statement
+                assignment = Statement(str(store))
+        except AttributeError:
+            pass
+        return assignment
+
+    def emit_vector_if(self, condition_mapper, ast):
+        """
+        Emit's a vector select function
+        """
+
+        def vecify(assign):
+            try:
+                # treat it as an assignment
+                return Assign(str(assign.lvalue.expr), str(VectorSelect(
+                    assign.rvalue.expr, assign.lvalue.expr,
+                    condition_mapper(assign))))
+            except AttributeError:
+                return False
+
+        from cgen import Assign, Block
+        vec_if = vecify(ast)
+        if not vec_if:
+            try:
+                vec_if = []
+                for assign in ast.contents:
+                    vec_if.append(vecify(assign))
+                if any(not x for x in vec_if):
+                    # one 'assign' failed
+                    vec_if = False
+                else:
+                    vec_if = Block(vec_if)
+            except AttributeError:
+                vec_if = False
+        if not vec_if and isinstance(ast, Block):
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.info('Cannot convert non-simple assign statement for instruction '
+                        '(%s) to vectorized conditional. '
+                        'Assuming that this is the result of a previous unvectorize '
+                        'call.')
+            return ast
+
+        return vec_if
+
     def add_vector_access(self, access_expr, index):
         # The 'int' avoids an 'L' suffix for long ints.
-        return access_expr.attr("s%s" % hex(int(index))[2:])
+        def __map(ind, use_prefix=True):
+            strmap = 's%s' if use_prefix else '%s'
+            start = 2
+            return strmap % hex(int(ind))[start:]
+        try:
+            lookup = ''
+            for i, ind in enumerate(index):
+                lookup += __map(ind, not i)
+        except TypeError:
+            # not iterable
+            lookup = __map(index)
+        return access_expr.attr(lookup)
+
+    def add_vector_shuffle(self, access_expr, vec_iname, array, index):
+        # this can simply call a vector access with the index list
+        return self.add_vector_access(access_expr, index)
+
+    def add_vector_load(self, access_expr, vec_iname, array, index):
+        from pymbolic import substitute
+        # get ctype for casting
+        ctype = str(array.get_arg_decl(
+            self, '', array.shape, array.dtype, False))
+        ctype = ctype[:ctype.rindex(array.name) - 1]
+        # and convert the vector access expression to a vector offset
+        # to do so, we substitute the vector iname -> 0 to eliminate it from the
+        # expression
+        offset = substitute(access_expr.index, {vec_iname: 0})
+        # try simplify
+        from pymbolic.mapper.evaluator import UnknownVariableError
+        try:
+            from loopy.isl_helpers import simplify_via_aff
+            offset = simplify_via_aff(offset)
+        except UnknownVariableError:
+            pass
+        # and cast / substitute in the calculated vector iname offset
+        cast_expr = '&((%s)%s)[%s]' % (ctype, array.name, index[0])
+        return VectorLoad(len(index), str(offset), str(cast_expr))
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
@@ -541,6 +778,17 @@ class OpenCLCASTBuilder(CASTBuilder):
         # FIXME: Could detect operations, generate atomic_{add,...} when
         # appropriate.
 
+        if codegen_state.vectorization_info is not None or \
+                codegen_state.insn_was_not_vectorizable:
+            # note - this check whether we've previously tried to vectorize and
+            # failed (in which case insn_was_not_vectorizable will be True) or
+            # whether vectorization_info is a valid :class:`VectorizationInfo`
+            #
+            # Both cases should fail (as we can't take the index of an unrolled
+            # atomic)
+            raise LoopyError('Atomic operators not implemented for '
+                             'explicit-SIMD vectorization')
+
         if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                 np.int32, np.int64, np.float32, np.float64]:
             from cgen import Block, DoWhile, Assign
diff --git a/loopy/target/python.py b/loopy/target/python.py
index ce04986d3d2a39dcf7126339055d32fa16ffcc25..ca4b116d0da72ae0c525f92a82732111ca8d1dbe 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -287,9 +287,9 @@ class PythonASTBuilderBase(ASTBuilderBase):
     def can_implement_conditionals(self):
         return True
 
-    def emit_if(self, condition_str, ast):
+    def emit_if(self, condition_mapper, ast):
         from genpy import If
-        return If(condition_str, ast)
+        return If(condition_mapper(), ast)
 
     def emit_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index d4128bd115666cf66c6f06a40823ed9d5929faab..5c149f177c98d192f565b47a4856996424873997 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -104,6 +104,8 @@ def privatize_temporaries_with_inames(
     .. versionadded:: 2018.1
     """
 
+    from loopy.kernel.data import VectorizeTag, IlpBaseTag, filter_iname_tags_by_type
+
     if isinstance(privatizing_inames, str):
         privatizing_inames = frozenset(
                 s.strip()
@@ -114,37 +116,90 @@ def privatize_temporaries_with_inames(
                 s.strip()
                 for s in only_var_names.split(","))
 
+    def find_privitzing_inames(writer_insn, iname, temp_var):
+        # There are now two flavors of privitzing iname promotion, one for ILP and
+        # another for vectorization
+
+        # Temporaries inside an ILP loop have have no additional requirements for
+        # promotion
+
+        # However, we should _not_ assume that this is the case for temporaries
+        # inside a vectorizing loop.  Instead, only temporaries written to by
+        # instructions that directly depend on the vector iname should be promoted.
+        # This is to avoid spurious promotions of constants (not-dependent on the
+        # vector iname) to vector dtypes, for example (w/ j_inner the vectorizing
+        # iname, and 'a' a data-array with a vector dtype on the second axis):
+        #
+        # ```
+        #   for j_outer
+        #       for j_inner
+        #           <> c = function()
+        #           a[c, j_inner] = 1
+        #       end
+        #   end
+        # ```
+        #
+        # is perfectly valid -- however, if c is promoted to a vector-dtype, we will
+        # hit issues with a (potentially) non-constant "vector" index being in a
+        # non-vector axis.  Hence, we must be cautions in vector promotions; those
+        # vector temporaries _not_ written to by a directly vector-iname dependent
+        # instruction will be promoted in the second stage (recursive application of
+        # the write map)
+
+        if filter_iname_tags_by_type(kernel.iname_to_tags[iname], IlpBaseTag):
+            return set([iname])
+        if filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag):
+            # For vector inames, we should only consider an iname if the
+            # instruction has a _direct_ dependency on it (to avoid spurious vector
+            # promotions).  Missed promotions will be handled in the recursive
+            # application step
+            return set([iname]) & writer_insn.dependency_names()
+        return set()
+
+    # {{{ Stage 1: find variables that need extra indices
+
+    from collections import defaultdict
+    tv_wmap = defaultdict(lambda: set())
     wmap = kernel.writer_map()
-
     var_to_new_priv_axis_iname = {}
 
-    # {{{ find variables that need extra indices
-
     for tv in six.itervalues(kernel.temporary_variables):
+        # check variables to transform
         if only_var_names is not None and tv.name not in only_var_names:
             continue
 
-        for writer_insn_id in wmap.get(tv.name, []):
+        for writer_insn_id in set(wmap.get(tv.name, [])):
             writer_insn = kernel.id_to_insn[writer_insn_id]
+            test_inames = kernel.insn_inames(writer_insn) & privatizing_inames
 
-            priv_axis_inames = kernel.insn_inames(writer_insn) & privatizing_inames
+            # see stage 2
+            for tv_read in writer_insn.read_dependency_names():
+                if tv_read in kernel.temporary_variables:
+                    tv_wmap[tv_read].add(tv.name)
 
+            priv_axis_inames = set()
+            for ti in test_inames:
+                priv_axis_inames |= find_privitzing_inames(writer_insn, ti, tv)
+
+            priv_axis_inames = frozenset(priv_axis_inames)
             referenced_priv_axis_inames = (priv_axis_inames
-                    & writer_insn.write_dependency_names())
+                & writer_insn.write_dependency_names())
 
             new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
 
             if not new_priv_axis_inames:
-                break
+                continue
 
             if tv.name in var_to_new_priv_axis_iname:
-                if new_priv_axis_inames != set(var_to_new_priv_axis_iname[tv.name]):
+                if new_priv_axis_inames != set(
+                        var_to_new_priv_axis_iname[tv.name]):
+                    # conflict
                     raise LoopyError("instruction '%s' requires adding "
-                            "indices for privatizing var '%s' on iname(s) '%s', "
+                            "indices for vector/ILP inames '%s' on var '%s', "
                             "but previous instructions required inames '%s'"
-                            % (writer_insn_id, tv.name,
-                                ", ".join(new_priv_axis_inames),
-                                ", ".join(var_to_new_priv_axis_iname[tv.name])))
+                            % (writer_insn_id, ", ".join(new_priv_axis_inames),
+                                tv.name, ", ".join(
+                                    var_to_new_priv_axis_iname[tv.name])))
 
                 continue
 
@@ -152,7 +207,51 @@ def privatize_temporaries_with_inames(
 
     # }}}
 
-    # {{{ find ilp iname lengths
+    # {{{ Stage 2: recursively apply vector temporary write heuristic
+
+    # A temporary variable that's only assigned to from other vector
+    # temporaries will never have a direct-dependency on the vector
+    # iname. After building a map of which temporary variables write to
+    # others, we can recursively travel down the temporary variable write-map
+    # of any newly vectorized temporary variable, and extend the
+    # vectorization to those temporary variables dependent on it.
+    #
+    # See ..func: `find_privitzing_inames` for reasoning about vector temporary
+    # promotion
+
+    def recursively_apply(varname, starting_dict, applied=None):
+        if applied is None:
+            # root case, set up set of variables we've already applied to act as
+            # a base case and avoid infinite recursion.
+            applied = set()
+
+        if varname not in tv_wmap or varname in applied:
+            # if no other variables depend on the starting variable, or the starting
+            # variable's privitizing inames have already been applied
+            return starting_dict
+
+        applied.add(varname)
+        for written_to in tv_wmap[varname]:
+            if written_to not in starting_dict:
+                starting_dict[written_to] = set()
+            # update the dependency
+            starting_dict[written_to] |= starting_dict[varname]
+            # and recursively apply to the dependecy's dependencies
+            starting_dict.update(recursively_apply(
+                written_to, starting_dict.copy(), applied=applied))
+
+        return starting_dict
+
+    # apply recursive write heueristic
+    for varname in list(var_to_new_priv_axis_iname.keys()):
+        if any(filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag)
+               for iname in var_to_new_priv_axis_iname[varname]):
+            var_to_new_priv_axis_iname.update(recursively_apply(
+                varname, var_to_new_priv_axis_iname.copy()))
+
+    # }}}
+
+    # {{{ find privitizing iname lengths
 
     from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import pw_aff_to_expr
@@ -174,8 +273,6 @@ def privatize_temporaries_with_inames(
 
     # {{{ change temporary variables
 
-    from loopy.kernel.data import VectorizeTag
-
     new_temp_vars = kernel.temporary_variables.copy()
     for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname):
         tv = new_temp_vars[tv_name]
@@ -210,13 +307,13 @@ def privatize_temporaries_with_inames(
         new_insn = insn.with_transformed_expressions(eiii)
         if not eiii.seen_priv_axis_inames <= insn.within_inames:
             raise LoopyError(
-                    "Kernel '%s': Instruction '%s': touched variable that "
-                    "(for privatization, e.g. as performed for ILP) "
-                    "required iname(s) '%s', but that the instruction was not "
-                    "previously within the iname(s). To remedy this, first promote"
-                    "the instruction into the iname."
-                    % (kernel.name, insn.id, ", ".join(
-                        eiii.seen_priv_axis_inames - insn.within_inames)))
+                "Kernel '%s': Instruction '%s': touched variable that "
+                "(for privatization, e.g. as performed for ILP) "
+                "required iname(s) '%s', but that the instruction was not "
+                "previously within the iname(s). To remedy this, first promote"
+                "the instruction into the iname."
+                % (kernel.name, insn.id, ", ".join(
+                    eiii.seen_priv_axis_inames - insn.within_inames)))
 
         new_insns.append(new_insn)
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 010a0658f71bcfcb037a81c6b61fd9417fc98b75..7716bcf468d6781fb58ca83dee978b3faa6f1485 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -243,6 +243,11 @@ class TypeInferenceMapper(CombineMapper):
             raise LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type))
         return [expr.type]
 
+    map_vector_type_cast = map_type_cast
+
+    def map_vector_literal(self, expr):
+        return self.combine([self.rec(child) for child in expr.children])
+
     def map_subscript(self, expr):
         return self.rec(expr.aggregate)
 
@@ -349,14 +354,26 @@ class TypeInferenceMapper(CombineMapper):
         dtype = field[0]
         return [NumpyType(dtype)]
 
-    def map_comparison(self, expr):
+    def map_comparison_types(self, dtype):
         # "bool" is unusable because OpenCL's bool has indeterminate memory
         # format.
-        return [NumpyType(np.dtype(np.int32))]
 
-    map_logical_not = map_comparison
-    map_logical_and = map_comparison
-    map_logical_or = map_comparison
+        if dtype[0].itemsize == 8:
+            return [NumpyType(np.dtype(np.int64))]
+        else:
+            return [NumpyType(np.dtype(np.int32))]
+
+    def map_logical_not(self, expr):
+        return self.map_comparison_types(self.rec(expr.child))
+
+    def map_logical_and(self, expr):
+        return self.map_comparison_types(
+            self.combine([self.rec(x) for x in expr.children]))
+    map_logical_or = map_logical_and
+
+    def map_comparison(self, expr):
+        return self.map_comparison_types(
+            self.combine([self.rec(expr.left), self.rec(expr.right)]))
 
     def map_group_hw_index(self, expr, *args):
         return [self.kernel.index_dtype]
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 89b4f5e639a031d3f2d4d89b470d2ccf5fb4b848..84a23214cb7453d1e200b3cdea17ef4fe401571d 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -770,7 +770,7 @@ def test_vector_types(ctx_factory, vec_len):
 
     ref_knl = knl
 
-    knl = lp.tag_data_axes(knl, "out", "c,vec")
+    knl = lp.tag_array_axes(knl, "out", "c,vec")
     knl = lp.tag_inames(knl, dict(j="unr"))
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
@@ -2739,7 +2739,7 @@ def test_preamble_with_separate_temporaries(ctx_factory):
 
     print(lp.generate_code(kernel)[0])
     # and call (functionality unimportant, more that it compiles)
-    ctx = cl.create_some_context()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
     # check that it actually performs the lookup correctly
     assert np.allclose(kernel(
@@ -2800,6 +2800,491 @@ def test_add_prefetch_works_in_lhs_index():
         assert "a1_map" not in get_dependencies(insn.assignees)
 
 
+def test_explicit_simd_shuffles(ctx_factory):
+    ctx = ctx_factory()
+
+    def create_and_test(insn, answer=None, atomic=False, additional_check=None,
+                        store=False):
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
+                             insn,
+                             [lp.GlobalArg('a', shape=(1, 14,), dtype=np.int32,
+                                           for_atomic=atomic),
+                              lp.GlobalArg('b', shape=(1, 14,), dtype=np.int32,
+                                           for_atomic=atomic)])
+
+        knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
+        knl = lp.tag_inames(knl, [('j', 'g.0')])
+        knl = lp.split_array_axis(knl, ['a', 'b'], 1, 4)
+        knl = lp.tag_array_axes(knl, ['a', 'b'], 'N1,N0,vec')
+
+        print(lp.generate_code_v2(knl).device_code())
+        queue = cl.CommandQueue(ctx)
+        if answer is None:
+            answer = np.zeros(16, dtype=np.int32)
+            if store:
+                answer[2:-2] = np.arange(0, 12, dtype=np.int32)
+            else:
+                answer[:-4] = np.arange(2, 14, dtype=np.int32)
+
+        a = np.zeros((1, 4, 4), dtype=np.int32)
+        b = np.arange(16, dtype=np.int32).reshape((1, 4, 4))
+        result = knl(queue, a=a, b=b)[1][0]
+
+        assert np.array_equal(result.flatten('C'), answer)
+        if additional_check is not None:
+            assert additional_check(knl)
+
+    # test w/ compile time temporary constant
+    create_and_test("<>c = 2\n" +
+                    "a[j, i] = b[j, i + c]",
+                    additional_check=lambda knl: 'vload' in lp.generate_code_v2(
+                        knl).device_code())
+    create_and_test("a[j, i] = b[j, i + 2]")
+    create_and_test("a[j, i] = b[j, i + 2] + a[j, i]")
+    create_and_test("a[j, i] = a[j, i] + b[j, i + 2]")
+    # test vector stores
+    create_and_test("<>c = 2\n" +
+                    "a[j, i + c] = b[j, i]",
+                    additional_check=lambda knl: 'vstore' in lp.generate_code_v2(
+                        knl).device_code(),
+                    store=True)
+    create_and_test("a[j, i + 2] = b[j, i]", store=True)
+    create_and_test("a[j, i + 2] = b[j, i] + a[j, i + 2]", store=True)
+    create_and_test("a[j, i + 2] = a[j, i + 2] + b[j, i]", store=True)
+    # test small vector shuffle
+    shuffled = np.arange(16, dtype=np.int32)[(np.arange(16) + 2) % 4 +
+                                              4 * (np.arange(16) // 4)]
+    shuffled[12:] = 0
+    create_and_test("a[j, i] = b[j, (i + 2) % 4 + 4 * (i // 4)]", shuffled)
+    create_and_test("a[j, (i + 2) % 4 + 4 * (i // 4)] = b[j, i]", shuffled)
+    # test atomics
+    from loopy import LoopyError
+    from loopy.codegen import Unvectorizable
+    with pytest.raises((LoopyError, Unvectorizable)):
+        temp = np.arange(12, dtype=np.int32)
+        answer = np.zeros(4, dtype=np.int32)
+        for i in range(4):
+            answer[i] = np.sum(temp[(i + 2) % 4::4])
+        create_and_test("a[j, (i + 2) % 4] = a[j, (i + 2) % 4] + b[j, i] {atomic}",
+                        answer, True)
+
+
+def test_explicit_simd_unr_iname(ctx_factory):
+    """
+    tests as scatter load to a specific lane of a vector array via an unrolled iname
+    """
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    insns = """
+        for j_outer, lane, i
+            a[j_outer, i, lane] = b[j_outer + lane, i]
+        end
+        """
+    knl = lp.make_kernel(
+        ['{[j_outer]: 0 <= j_outer < 4}',
+         '{[i]: 0 <= i < 4}',
+         '{[lane]: 0 <= lane < 4}'],
+        insns,
+        [lp.GlobalArg('a', shape=(4, 4, 4)),
+         lp.GlobalArg('b', shape=(8, 4))])
+
+    knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
+    knl = lp.tag_inames(knl, {'lane': 'unr'})
+    knl = lp.prioritize_loops(knl, 'j_outer, i, lane')
+
+    a = np.zeros((4, 4, 4))
+    b = np.arange(8 * 4).reshape((8, 4))
+
+    a = knl(queue, a=a, b=b)[1][0]
+    # create answer
+    ans = np.tile(np.arange(4, dtype=np.float64), 16).reshape((4, 4, 4))
+    ans *= 4
+    ans += 4 * np.arange(4)[:, np.newaxis, np.newaxis] + np.arange(4)[:, np.newaxis]
+
+    assert np.array_equal(a, ans)
+
+
+def test_explicit_simd_temporary_promotion(ctx_factory):
+    from loopy.kernel.data import temp_var_scope as scopes
+
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    # fun with vector temporaries
+
+    def make_kernel(insn, ans=None, preamble=None, extra_inames=None, skeleton=None,
+                    dtype=None):
+        skeleton = """
+        %(preamble)s
+        for j
+            for i
+                %(insn)s
+                if test
+                    a[i, j] = 1
+                end
+            end
+        end
+        """ if skeleton is None else skeleton
+        dtype = dtype if dtype is not None else (
+            ans.dtype if ans is not None else np.int32)
+        inames = ['i, j']
+        if extra_inames is not None:
+            inames += list(extra_inames)
+        knl = lp.make_kernel(
+            '{[%(inames)s]: 0 <= %(inames)s < 12}' % {'inames': ', '.join(inames)},
+            skeleton % dict(insn=insn, preamble='' if not preamble else preamble),
+            [lp.GlobalArg('a', shape=(12, 12), dtype=dtype),
+             lp.TemporaryVariable('mask', shape=(12,), initializer=np.array(
+                                  np.arange(12) >= 6, dtype=dtype), read_only=True,
+                                  scope=scopes.GLOBAL)])
+
+        knl = lp.split_iname(knl, 'j', 4, inner_tag='vec')
+        knl = lp.split_array_axis(knl, 'a', 1, 4)
+        knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
+        knl = lp.preprocess_kernel(knl)
+
+        if ans is not None:
+            assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4), dtype=dtype))[
+                1][0], ans)
+
+        return knl
+
+    ans = np.zeros((12, 3, 4))
+    ans[6:, :, :] = 1
+    # case 1) -- incorrect promotion of temporaries to vector dtypes
+    make_kernel('<> test = mask[i]', ans)
+
+    # next test the writer heuristic
+
+    # case 2) assignment from a vector iname
+    knl = make_kernel('<> test = mask[j]')
+    assert knl.temporary_variables['test'].shape == (4,)
+
+    # case 3) recursive dependency
+    knl = make_kernel("""
+        <> test = mask[j]
+        <> test2 = test
+        """)
+    assert knl.temporary_variables['test2'].shape == (4,)
+
+    # case 4)
+    # modified case from pyjac -- what makes this case special is that
+    # Kc is never directly assigned to in an instruction that directly references
+    # the vector iname, j_inner.  Instead, it is a good test of the recursive
+    # vector temporary promotion, as it is written to by B_sum, which _is_ directly
+    # written to from an instruction (bset1) that references j_inner
+    skeleton = """
+    for j
+        %(preamble)s
+        for i
+            %(insn)s
+            if i > 6
+                <> P_val = 100 {id=pset0, nosync=pset1}
+            else
+                P_val = 0.01 {id=pset1, nosync=pset0}
+            end
+            <> B_sum = 0 {id=bset0}
+            for k
+                B_sum = B_sum + k * a[i, j] {id=bset1, dep=*:bset0}
+            end
+            # here, we are testing that Kc is properly promoted to a vector dtype
+            <> P_sum = P_val * i {id=pset2, dep=pset0:pset1}
+            B_sum = exp(B_sum) {id=bset2, dep=bset0:bset1}
+            <> Kc = P_sum * B_sum {id=kset, dep=bset*:pset2}
+            a[i, j] = Kc {dep=*:kset, nosync=pset0:pset1}
+        end
+    end
+    """
+
+    knl = make_kernel('', dtype=np.float32, skeleton=skeleton, extra_inames='k')
+    from loopy.kernel.array import VectorArrayDimTag
+    assert any(isinstance(x, VectorArrayDimTag)
+               for x in knl.temporary_variables['Kc'].dim_tags)
+
+
+def test_explicit_simd_selects(ctx_factory):
+    ctx = ctx_factory()
+
+    def create_and_test(insn, condition, answer, exception=None, a=None, b=None,
+                        extra_insns=None, c=None, v=None, check=None, debug=False):
+        a = np.zeros((3, 4), dtype=np.int32) if a is None else a
+        data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
+        kwargs = dict(a=a)
+        if b is not None:
+            data += [lp.GlobalArg('b', shape=(12,), dtype=b.dtype)]
+            kwargs['b'] = b
+        if c is not None:
+            data += [lp.GlobalArg('c', shape=(12,), dtype=b.dtype)]
+            kwargs['c'] = c
+        names = [d.name for d in data]
+        # add after defining names to avoid trying to split value arg
+        if v is not None:
+            data += [lp.ValueArg('v', dtype=np.int32)]
+            kwargs['v'] = v
+
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
+            """
+            for i
+                %(extra)s
+                if %(condition)s
+                    %(insn)s
+                end
+            end
+            """ % dict(condition=condition,
+                       insn=insn,
+                       extra=extra_insns if extra_insns else ''),
+            data
+            )
+
+        knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
+        knl = lp.split_array_axis(knl, names, 0, 4)
+        knl = lp.tag_array_axes(knl, names, 'N0,vec')
+        if v is not None:
+            knl = lp.set_options(knl, write_wrapper=True)
+
+        queue = cl.CommandQueue(ctx)
+        if check is not None:
+            assert check(knl)
+        elif exception is not None:
+            with pytest.raises(exception):
+                knl(queue, **kwargs)
+        else:
+            if not isinstance(answer, tuple):
+                answer = (answer,)
+            if debug:
+                print(lp.generate_code_v2(knl).device_code())
+            result = knl(queue, **kwargs)[1]
+            for r, a in zip(result, answer):
+                assert np.array_equal(r.flatten('C'), a)
+
+    ans = np.zeros(12, dtype=np.int32)
+    ans[7:] = 1
+    # 1) test a conditional on a vector iname
+    create_and_test('a[i] = 1', 'i > 6', ans)
+    # 2) condition on a vector array
+    create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(
+        12, dtype=np.int32).reshape((3, 4)))
+    # 3) condition on a vector temporary
+    create_and_test('a[i] = 1', 'c', ans, extra_insns='<> c = (i < 7) - 1')
+    # 4) condition on an assigned vector array, this should work as assignment to a
+    # vector can be safely unrolled
+    create_and_test('a[i] = 1', '(b[i] > 6)', ans,
+                    b=np.zeros((3, 4), dtype=np.int32),
+                    extra_insns='b[i] = i')
+    # 5) a block of simple assignments, this should be seemlessly translated to
+    # multiple vector if statements
+    c_ans = np.ones(12, dtype=np.int32)
+    c_ans[7:] = 0
+    create_and_test('a[i] = 1\nc[i] = 0', '(b[i] > 6)', (ans, c_ans), b=np.arange(
+        12, dtype=np.int32).reshape((3, 4)), c=np.ones((3, 4), dtype=np.int32))
+    # 6) test a negated conditional
+    ans_negated = np.invert(ans) + 2
+    create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
+        12, dtype=np.int32).reshape((3, 4)))
+    # 7) test conditional on differing dtype
+    ans_negated = np.invert(ans) + 2
+    create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
+        12, dtype=np.int64).reshape((3, 4)))
+    # 8) test conditional on differing dtype (float->int) and (int->float)
+    ans_negated = np.invert(ans) + 2
+    create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
+        12, dtype=np.float64).reshape((3, 4)))
+    create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
+        12, dtype=np.int64).reshape((3, 4)), a=np.zeros((3, 4), dtype=np.float32))
+    # 9) test conditional on valuearg, the "test" here is that we can actually
+    # generate the code
+    create_and_test('a[i] = 1', 'v', np.ones_like(ans), v=1)
+
+
+@pytest.mark.parametrize(('lhs_dtype', 'rhs_dtype'), [
+    (np.int32, np.int64),
+    (np.float32, np.float64)])
+def test_explicit_vector_dtype_conversion(ctx_factory, lhs_dtype, rhs_dtype):
+    ctx = ctx_factory()
+
+    # test that dtype conversion happens correctly between differing vector-dtypes
+    def __make_kernel(insn, has_conversion=True, uses_temp=True):
+        vw = 4
+        a_lp = lp.GlobalArg('a', shape=(12,), dtype=rhs_dtype)
+        temp_lp = lp.TemporaryVariable('temp', dtype=lhs_dtype)
+
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
+                """
+                for i
+                    {insn}
+                end
+                """.format(insn=insn),
+                [a_lp, temp_lp],
+                target=lp.PyOpenCLTarget(ctx.devices[0]),
+                silenced_warnings=['temp_to_write(temp)'] if not uses_temp else [])
+        knl = lp.split_iname(knl, 'i', vw, inner_tag='vec')
+        knl = lp.split_array_axis(knl, 'a', 0, 4)
+        knl = lp.tag_array_axes(knl, 'a', 'N0,vec')
+
+        queue = cl.CommandQueue(ctx)
+        # check that the kernel compiles correctly
+        knl(queue, a=np.zeros((12,), dtype=rhs_dtype).reshape((3, 4)))
+
+        # check that we have or don't have a conversion
+        assert ('convert_' in lp.generate_code_v2(knl).device_code()) == \
+            has_conversion
+
+    # test simple dtype conversion
+    __make_kernel("temp = a[i]")
+
+    # test literal assignment
+    __make_kernel("a[i] = 1", False, False)
+
+    # test that a non-vector temporary doesn't trigger conversion
+    #
+    # this should generate the code (e.g.,):
+    #   __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1)))
+    #   loopy_kernel(__global long4 *__restrict__ a)
+    #   {
+    #      int temp;
+    #      for (int i_outer = 0; i_outer <= 2; ++i_outer)
+    #      {
+    #        temp = 1;
+    #        a[i_outer] = temp;
+    #      }
+    #    }
+    #
+    # that is, temp should _not_ be assigned to "a" w/ convert_long4
+    __make_kernel("""
+                      temp = 1
+                      a[i] = temp
+                  """, has_conversion=False)
+
+    # test that the inverse _does_ result in a convers
+    __make_kernel("""
+                      temp = a[i] {id=1, dep=*}
+                      a[i] = temp {id=2, dep=1}
+                  """)
+
+
+@pytest.mark.parametrize('dtype', [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize('vec_width', [2, 3, 4, 8, 16])
+def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype, vec_width):
+    ctx = ctx_factory()
+
+    size = vec_width * 4
+
+    def create_and_test(insn, answer, shape=(1, size), debug=False,
+                        vectors=['a', 'b']):
+        num_conditions = shape[0]
+        knl = lp.make_kernel(['{{[i]: 0 <= i < {}}}'.format(size),
+                              '{{[j]: 0 <= j < {}}}'.format(num_conditions)],
+                             insn,
+                             [lp.GlobalArg('a', shape=shape, dtype=dtype),
+                              lp.GlobalArg('b', shape=shape, dtype=dtype)])
+
+        knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
+        knl = lp.tag_inames(knl, [('j', 'g.0')])
+        knl = lp.split_array_axis(knl, ['a', 'b'], 1, 4)
+        knl = lp.tag_array_axes(knl, vectors, 'N1,N0,vec')
+
+        # ensure we can generate code
+        code = lp.generate_code_v2(knl).device_code()
+        if debug:
+            print(code)
+        # and check answer
+        queue = cl.CommandQueue(ctx)
+
+        num_vectors = int(shape[1] / 4)
+        a = np.zeros((num_conditions, num_vectors, 4), dtype=dtype)
+        b = np.arange(num_conditions * num_vectors * 4, dtype=dtype).reshape(
+            (num_conditions, num_vectors, 4))
+        result = knl(queue, a=a, b=b)[1][0]
+
+        assert np.array_equal(result.flatten('C'), answer)
+
+    ans = np.arange(size, dtype=np.int32)
+    ans[:7] = 0
+    create_and_test("""
+        if i >= 7
+            a[j, i] = b[j, i]
+        end
+    """, ans)
+
+    # a case that will result in a unvectorized evaluation
+    # this tests that we are properly able to unwind any vectorized conditional that
+    # has been applied, and then reapply the correct scalar conditional in
+    # unvectorize
+    ans = np.arange(12 * size, dtype=np.int32)
+    ans[:7] = 0
+    create_and_test("""
+        if j * 12 + i >= 7
+            a[j, i] = b[j, i]
+        end
+    """, ans, shape=(12, size), vectors=['b'])
+
+
+def test_vectorizability():
+    # check new vectorizability conditions
+    from loopy.kernel.array import VectorArrayDimTag
+    from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type
+
+    def create_and_test(insn, exception=None, a=None, b=None):
+        a = np.zeros((3, 4), dtype=np.int32) if a is None else a
+        data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
+        kwargs = dict(a=a)
+        if b is not None:
+            data += [lp.GlobalArg('b', shape=(12,), dtype=b.dtype)]
+            kwargs['b'] = b
+        names = [d.name for d in data]
+
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
+            """
+            for i
+                %(insn)s
+            end
+            """ % dict(insn=insn),
+            data
+            )
+
+        knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
+        knl = lp.split_array_axis(knl, names, 0, 4)
+        knl = lp.tag_array_axes(knl, names, 'N0,vec')
+        knl = lp.preprocess_kernel(knl)
+        lp.generate_code_v2(knl).device_code()
+        assert knl.instructions[0].within_inames & set(['i_inner'])
+        assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag)
+        assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag)
+        assert filter_iname_tags_by_type(knl.iname_to_tags['i_inner'], VectorizeTag)
+
+    def run(op_list=[], unary_operators=[], func_list=[], unary_funcs=[],
+            rvals=['1', 'a[i]']):
+        for op in op_list:
+            template = 'a[i] = a[i] %(op)s %(rval)s' \
+                if op not in unary_operators else 'a[i] = %(op)s a[i]'
+            for rval in rvals:
+                create_and_test(template % dict(op=op, rval=rval))
+        for func in func_list:
+            template = 'a[i] = %(func)s(a[i], %(rval)s)' \
+                if func not in unary_funcs else 'a[i] = %(func)s(a[i])'
+            for rval in rvals:
+                create_and_test(template % dict(func=func, rval=rval))
+
+    # 1) comparisons
+    run(['>', '>=', '<', '<=', '==', '!='])
+
+    # 2) logical operators
+    run(['and', 'or', 'not'], ['not'])
+
+    # 3) bitwise operators
+    # bitwise xor '^' not not implemented in codegen
+    run(['~', '|', '&'], ['~'])
+
+    # 4) functions -- a random selection of the enabled math functions in opencl
+    run(func_list=['acos', 'exp10', 'atan2', 'round'],
+        unary_funcs=['round', 'acos', 'exp10'])
+
+    # 5) remainders and floor division (use 4 instead of 1 to avoid pymbolic
+    #    optimizing out the a[i] % 1)
+    run(['%', '//'], rvals=['a[i]', '4'])
+
+
 def test_check_for_variable_access_ordering():
     knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
diff --git a/test/test_transform.py b/test/test_transform.py
index 3ee67b703964d1f7773b10a9199687d78b883a60..740dc5b5c274b4c234c729612f905b8003dde091 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -260,6 +260,9 @@ def test_vectorize(ctx_factory):
             split_kwargs=dict(slabs=(0, 1)))
 
     knl = lp.tag_data_axes(knl, "a,b", "c,vec")
+    # note: in order to eliminate a vector iname from the (implicit) conditional on
+    # the size of the i-loop, we must fix `n` before generating the kernel
+    knl = lp.fix_parameters(knl, n=30)
     ref_knl = knl
     ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})
 
@@ -270,8 +273,7 @@ def test_vectorize(ctx_factory):
     code, inf = lp.generate_code(knl)
 
     lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(n=30))
+            ref_knl, ctx, knl)
 
 
 def test_extract_subst(ctx_factory):