diff --git a/examples/python/vector-types.py b/examples/python/vector-types.py index 328aea154bfee653a6fcbf3fc8ad74b08375e13d..82aadb817c2fa1f75c91da870650436cf0049dc9 100644 --- a/examples/python/vector-types.py +++ b/examples/python/vector-types.py @@ -14,8 +14,9 @@ knl = lp.make_kernel( "out[i] = 2*a[i]") knl = lp.set_options(knl, write_code=True) +knl = lp.fix_parameters(knl, n=n) knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec") knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4) knl = lp.tag_array_axes(knl, "a,out", "C,vec") -knl(queue, a=a.reshape(-1, 4), n=n) +knl(queue, a=a.reshape(-1, 4)) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..43d5a6112fae8b38effb3dd7a0f12fa4e51058ee 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -24,7 +24,7 @@ THE SOFTWARE. import six -from loopy.diagnostic import LoopyError, warn +from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import ImmutableRecord import islpy as isl @@ -178,6 +178,11 @@ class CodeGenerationState(object): None or an instance of :class:`VectorizationInfo` + .. attribute:: insn_was_not_vectorizable + + If true, we have a call to :func:`try_vectorized` has failed, and we are + in the :func:`unvectorize` fallback + .. attribute:: is_generating_device_code .. attribute:: gen_program_name @@ -196,7 +201,8 @@ class CodeGenerationState(object): vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + schedule_index_end=None, + insn_was_not_vectorizable=False): self.kernel = kernel self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain @@ -211,6 +217,7 @@ class CodeGenerationState(object): self.is_generating_device_code = is_generating_device_code self.gen_program_name = gen_program_name self.schedule_index_end = schedule_index_end + self.insn_was_not_vectorizable = insn_was_not_vectorizable # {{{ copy helpers @@ -219,7 +226,8 @@ class CodeGenerationState(object): var_subst_map=None, vectorization_info=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + schedule_index_end=None, + removed_predicates=frozenset()): if kernel is None: kernel = self.kernel @@ -227,12 +235,14 @@ class CodeGenerationState(object): if implemented_data_info is None: implemented_data_info = self.implemented_data_info + if vectorization_info is None: + vectorization_info = self.vectorization_info + + insn_was_not_vectorizable = self.insn_was_not_vectorizable if vectorization_info is False: + insn_was_not_vectorizable = True vectorization_info = None - elif vectorization_info is None: - vectorization_info = self.vectorization_info - if is_generating_device_code is None: is_generating_device_code = self.is_generating_device_code @@ -247,7 +257,8 @@ class CodeGenerationState(object): implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( - implemented_predicates or self.implemented_predicates), + (implemented_predicates or self.implemented_predicates) - + removed_predicates), seen_dtypes=self.seen_dtypes, seen_functions=self.seen_functions, seen_atomic_dtypes=self.seen_atomic_dtypes, @@ -257,7 +268,8 @@ class CodeGenerationState(object): var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, gen_program_name=gen_program_name, - schedule_index_end=schedule_index_end) + schedule_index_end=schedule_index_end, + insn_was_not_vectorizable=insn_was_not_vectorizable) def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" @@ -306,7 +318,7 @@ class CodeGenerationState(object): return self.copy_and_assign(iname, expr).copy( implemented_domain=new_impl_domain) - def try_vectorized(self, what, func): + def try_vectorized(self, what, func, vector_kwargs={}): """If *self* is in a vectorizing state (:attr:`vectorization_info` is not None), tries to call func (which must be a callable accepting a single :class:`CodeGenerationState` argument). If this fails with @@ -321,9 +333,9 @@ class CodeGenerationState(object): return func(self) try: - return func(self) + return func(self, **vector_kwargs) except Unvectorizable as e: - warn(self.kernel, "vectorize_failed", + warn_with_kernel(self.kernel, "vectorize_failed", "Vectorization of '%s' failed because '%s'" % (what, e)) @@ -332,7 +344,11 @@ class CodeGenerationState(object): def unvectorize(self, func): vinf = self.vectorization_info result = [] - novec_self = self.copy(vectorization_info=False) + novec_self = self.copy( + vectorization_info=False, + # we must clear the implemented predicates, as they may have been + # generated as vector conditionals, and no longer be valide + removed_predicates=self.implemented_predicates) for i in range(vinf.length): idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e1520a82ed69fa2aed729d9b1d849a78d658c4e1..31d0cbbf72a1373404231062b5dd6e8520577b87 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -510,11 +510,12 @@ def build_loop_nest(codegen_state, schedule_index): pred_chk for pred_chk in pred_checks] prev_result = prev_gen_code(inner_codegen_state) - - return [wrap_in_if( - inner_codegen_state, - condition_exprs, - merge_codegen_results(codegen_state, prev_result))] + inner = merge_codegen_results(codegen_state, prev_result) + return [new_codegen_state.try_vectorized( + inner.current_ast(inner_codegen_state), + lambda ics, **kwargs: wrap_in_if( + ics, condition_exprs, inner, **kwargs), + vector_kwargs={'is_vectorized': True})] cannot_vectorize = False if new_codegen_state.vectorization_info is not None: diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index e590502fb5813af0a820d45228de8e11c35a46c8..e370eef674d48adc7c410ce556c479c10ca8a8e9 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -64,7 +64,7 @@ def to_codegen_result( from pymbolic.primitives import LogicalAnd from pymbolic.mapper.stringifier import PREC_NONE ast = codegen_state.ast_builder.emit_if( - codegen_state.expression_to_code_mapper( + lambda: codegen_state.expression_to_code_mapper( LogicalAnd(tuple(condition_exprs)), PREC_NONE), ast) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 4318ad71c1b16deeaac98f8408d5ca82f2de1714..eb8586f0dd80b839a3d346f181214c9c05579ddd 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -88,6 +88,11 @@ class CodeGenerationResult(ImmutableRecord): a list of :class:`loopy.codegen.ImplementedDataInfo` objects. Only added at the very end of code generation. + + .. attribute:: vectorize_failed + + If True, the currently generated instructions are in the unrolled failed + vectorization state (i.e., 'unvectorize') """ @staticmethod @@ -100,12 +105,12 @@ class CodeGenerationResult(ImmutableRecord): if codegen_state.is_generating_device_code: kwargs = { "host_program": None, - "device_programs": [prg], + "device_programs": [prg] } else: kwargs = { "host_program": prg, - "device_programs": [], + "device_programs": [] } return CodeGenerationResult( @@ -254,17 +259,126 @@ def merge_codegen_results(codegen_state, elements, collapse=True): **kwargs)) -def wrap_in_if(codegen_state, condition_exprs, inner): +def wrap_in_if(codegen_state, condition_exprs, inner, is_vectorized=False): + """ + :param:`is_vectorized` indicates whether the generated AST was successfully + vectorized, or whether it was fed through unvectorize + """ if condition_exprs: from pymbolic.primitives import LogicalAnd from pymbolic.mapper.stringifier import PREC_NONE cur_ast = inner.current_ast(codegen_state) + method = codegen_state.ast_builder.emit_if + + def condition_mapper(ast=None, type_context=None, needed_dtype=None, + condition=None): + if condition is not None: + # explicit vectorization override + pass + else: + condition = LogicalAnd(tuple(condition_exprs)) + return codegen_state.expression_to_code_mapper( + condition, PREC_NONE, + type_context=type_context, needed_dtype=needed_dtype) + mapper = condition_mapper + + if codegen_state.vectorization_info is not None and is_vectorized: + from loopy.symbolic import get_dependencies + from loopy.kernel.array import VectorArrayDimTag + from loopy.kernel.data import ValueArg + + vec_iname = codegen_state.vectorization_info.iname + + # precalculate vector arrays / temporaries + knl = codegen_state.kernel + vec_arys = set([x.name for x in knl.args + list( + knl.temporary_variables.values()) + if not isinstance(x, ValueArg) and any( + isinstance(dt, VectorArrayDimTag) + for dt in x.dim_tags)]) + + def check_vec_dep(condition): + deps = get_dependencies(condition) + # check conditions for explicit vector iname dependecies + if len(deps & set([vec_iname])): + return True + # check for vector temporaries / arrays in conditional + if len(deps & vec_arys): + return True + + if any(check_vec_dep(cond) for cond in condition_exprs): + # condition directly involves a vector array or iname + + def condition_mapper_wrapper(ast=None): + if ast is None: + # default case for printing + return condition_mapper() + + # get the default condition to check for vectorizability + check = condition_mapper() + + # get LHS dtype for (potential) casting of condition + from loopy.expression import dtype_to_type_context + lhs_dtype = codegen_state.expression_to_code_mapper.infer_type( + ast.lvalue.expr) + if not lhs_dtype.is_integral(): + # in OpenCL, the dtype of the conditional in a select call + # must be an integer of the same 'bitness' as the dtype of + # the conditional (https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/select.html) # noqa + # (e.g., float64 -> int64) + from loopy.types import to_loopy_type + import numpy as np + lhs_dtype = to_loopy_type( + np.dtype('i%d' % lhs_dtype.itemsize), + lhs_dtype.target) + type_context = dtype_to_type_context(codegen_state.kernel.target, + lhs_dtype) + + from loopy.symbolic import VectorTypeCast + from loopy.types import to_loopy_type + from pymbolic.primitives import Variable + from pymbolic.mapper.substitutor import substitute + import numpy as np + kwargs = {} + deps = set() + try: + for c in check.expr.children: + deps |= get_dependencies(c) + + if deps & set([vec_iname]): + # we have to insert our own temporary version of the + # vector iname here + # get the vector size + size = codegen_state.vectorization_info.length + # determine the dtype + np_dtype = np.dtype('i%d' % lhs_dtype.itemsize) + dtype = codegen_state.kernel.target.\ + get_dtype_registry().dtype_to_ctype( + to_loopy_type(np_dtype, + target=codegen_state.kernel.target)) + # get the string form + name = '%s%d' % (dtype, size) + # next, get the base of a vector temporary + init = range(size) + # finally, put in a vextor typecast + temp_iname = VectorTypeCast(np_dtype, init, name) + kwargs['condition'] = substitute( + check.expr, {Variable(vec_iname): temp_iname}) + + except (AttributeError, TypeError): + pass + + return condition_mapper( + type_context=type_context, needed_dtype=lhs_dtype, + **kwargs) + + # mark as vector predicates + method = codegen_state.ast_builder.emit_vector_if + mapper = condition_mapper_wrapper + return inner.with_new_ast( codegen_state, - codegen_state.ast_builder.emit_if( - codegen_state.expression_to_code_mapper( - LogicalAnd(tuple(condition_exprs)), PREC_NONE), - cur_ast)) + method(mapper, cur_ast)) return inner diff --git a/loopy/expression.py b/loopy/expression.py index 3269bc09f064f57857eaa5218c8370383e0f735e..06fe3bb06b1071bb6e8a1a80b861197d9eb0ed5f 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -63,6 +63,30 @@ class VectorizabilityChecker(RecursiveMapper): .. attribute:: vec_iname """ + # this is a simple list of math functions from OpenCL-1.2 + # https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/mathFunctions.html + # this could be expanded / moved to it's own target specific VecCheck if + # necessary + functions = """acos acosh acospi asin + asinh asinpi atan atan2 + atanh atanpi atan2pi cbrt + ceil copysign cos cosh + cospi erfc erf exp + exp2 exp10 expm1 fabs + fdim floor fma fmax + fmin fmod fract frexp + hypot ilogb ldexp lgamma + lgamma_r log log2 log10 + log1p logb mad maxmag + minmag modf nan nextafter + pow pown powr remainder + remquo rint rootn round + rsqrt sin sincos sinh + sinpi sqrt tan tanh + tanpi tgamma trunc""" + + functions = [x.strip() for x in functions.split() if x.strip()] + def __init__(self, kernel, vec_iname, vec_iname_length): self.kernel = kernel self.vec_iname = vec_iname @@ -75,7 +99,7 @@ class VectorizabilityChecker(RecursiveMapper): return reduce(and_, vectorizabilities) def map_sum(self, expr): - return any(self.rec(child) for child in expr.children) + return any([self.rec(child) for child in expr.children]) map_product = map_sum @@ -84,6 +108,16 @@ class VectorizabilityChecker(RecursiveMapper): or self.rec(expr.denominator)) + map_remainder = map_quotient + + def map_floor_div(self, expr): + """ + (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) + """ + a, b = expr.numerator, expr.denominator + return self.rec(a) and self.rec(a.lt(0)) and self.rec(b - 1) and \ + self.rec((a - (b - 1)) / b) and self.rec(a / b) + def map_linear_subscript(self, expr): return False @@ -93,10 +127,54 @@ class VectorizabilityChecker(RecursiveMapper): rec_pars = [ self.rec(child) for child in expr.parameters] if any(rec_pars): - raise Unvectorizable("fucntion calls cannot yet be vectorized") + if str(expr.function) not in VectorizabilityChecker.functions: + return Unvectorizable( + 'Function {} is not known to be vectorizable'.format( + str(expr.function))) + return True return False + @staticmethod + def compile_time_constants(kernel, vec_iname): + """ + Returns a dictionary of (non-vector) inames and temporary variables whose + value is known at "compile" time. These are used (in combination with a + codegen state's variable substitution map) to simplifying access expressions + in :func:`get_access_info`. + + Note: inames are mapped to the :class:`Variable` version of themselves, + while temporary variables are mapped to their integer value + + .. parameter:: kernel + The kernel to check + .. parameter:: vec_iname + the vector iname + + """ + + # determine allowed symbols as non-vector inames + from pymbolic.primitives import Variable + allowed_symbols = dict((sym, Variable(sym)) for sym in kernel.all_inames() + if sym != vec_iname) + from loopy.kernel.instruction import Assignment + from loopy.tools import is_integer + from six import iteritems + + # and compile time integer temporaries + compile_time_assign = dict((str(insn.assignee), insn.expression) + for insn in kernel.instructions if + isinstance(insn, Assignment) and is_integer( + insn.expression)) + allowed_symbols.update( + dict((sym, compile_time_assign[sym]) for sym, var in iteritems( + kernel.temporary_variables) + # temporary variables w/ no initializer, no shape + if var.initializer is None and not var.shape + # compile time integers + and sym in compile_time_assign)) + return allowed_symbols + def map_subscript(self, expr): name = expr.aggregate.name @@ -114,29 +192,45 @@ class VectorizabilityChecker(RecursiveMapper): index = expr.index_tuple - from loopy.symbolic import get_dependencies + from loopy.symbolic import get_dependencies, DependencyMapper from loopy.kernel.array import VectorArrayDimTag - from pymbolic.primitives import Variable possible = None for i in range(len(var.shape)): - if ( - isinstance(var.dim_tags[i], VectorArrayDimTag) - and isinstance(index[i], Variable) - and index[i].name == self.vec_iname): + dep_mapper = DependencyMapper(composite_leaves=False) + deps = dep_mapper(index[i]) + # if we're on the vector index + if isinstance(var.dim_tags[i], VectorArrayDimTag): if var.shape[i] != self.vec_iname_length: raise Unvectorizable("vector length was mismatched") - if possible is None: - possible = True - - else: - if self.vec_iname in get_dependencies(index[i]): - raise Unvectorizable("vectorizing iname '%s' occurs in " - "unvectorized subscript axis %d (1-based) of " - "expression '%s'" - % (self.vec_iname, i+1, expr)) - break + possible = self.vec_iname in [str(x) for x in deps] + # or, if not vector index, and vector iname is present + elif self.vec_iname in set(x.name for x in deps): + # check whether we can simplify out the vector iname + context = dict((str(x), x) for x in deps if x.name != self.vec_iname) + allowed_symbols = self.compile_time_constants( + self.kernel, self.vec_iname) + + from pymbolic import substitute + from pymbolic.mapper.evaluator import UnknownVariableError + from loopy.tools import is_integer + for veci in range(self.vec_iname_length): + ncontext = context.copy() + ncontext[self.vec_iname] = veci + try: + idi = substitute(index[i], ncontext) + if not is_integer(idi) and not all( + x in allowed_symbols + for x in get_dependencies(idi)): + raise Unvectorizable( + "vectorizing iname '%s' occurs in " + "unvectorized subscript axis %d (1-based) of " + "expression '%s', and could not be simplified" + "to compile-time constants." + % (self.vec_iname, i+1, expr)) + except UnknownVariableError: + break return bool(possible) @@ -160,16 +254,31 @@ class VectorizabilityChecker(RecursiveMapper): return False def map_comparison(self, expr): - # FIXME: These actually can be vectorized: # https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/relationalFunctions.html + # even better for OpenCL <, <=, >, >=, !=, == are all vectorizable by default + # (see: sec 6.3.d-6.d.3 in OpenCL-1.2 docs) + + if expr.operator in ["<", "<=", ">", ">=", "!=", "=="]: + return any([self.rec(x) for x in [expr.left, expr.right]]) + raise Unvectorizable() def map_logical_not(self, expr): - raise Unvectorizable() + # 6.3.h in OpenCL-1.2 docs + return self.rec(expr.child) + + def map_logical_and(self, expr): + # 6.3.h in OpenCL-1.2 docs + return any(self.rec(x) for x in expr.children) + + map_logical_or = map_logical_and - map_logical_and = map_logical_not - map_logical_or = map_logical_not + # sec 6.3.f in OpenCL-1.2 docs + map_bitwise_not = map_logical_not + map_bitwise_or = map_logical_and + map_bitwise_xor = map_logical_and + map_bitwise_and = map_logical_and def map_reduction(self, expr): # FIXME: Do this more carefully diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 3588f38af13479b127208c25735f1046eaa82706..f7025f6eebb9fd772574e43268a77ede3436fa67 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1213,33 +1213,58 @@ class AccessInfo(ImmutableRecord): """ -def get_access_info(target, ary, index, eval_expr, vectorization_info): +def get_access_info(target, ary, index, var_subst_map, vectorization_info): """ :arg ary: an object of type :class:`ArrayBase` :arg index: a tuple of indices representing a subscript into ary + :arg var_subst_map: a context of variable substitutions from the calling codegen + state and potentially other compile-time "constants" (inames and + integer temporaries w/ known values), used in detection of loads / shuffles :arg vectorization_info: an instance of :class:`loopy.codegen.VectorizationInfo`, or *None*. """ import loopy as lp from pymbolic import var + from loopy.codegen import Unvectorizable + from loopy.symbolic import get_dependencies - def eval_expr_assert_integer_constant(i, expr): + def eval_expr_assert_constant(i, expr, kwargs): from pymbolic.mapper.evaluator import UnknownVariableError + # determine error type -- if vectorization_info is None, we're in the + # unvec fallback (and should raise a LoopyError) + # if vectorization_info is 'True', we should raise an Unvectorizable + # on failure + error_type = LoopyError if vectorization_info is None else Unvectorizable + from pymbolic import evaluate try: - result = eval_expr(expr) + result = evaluate(expr, kwargs) except UnknownVariableError as e: - raise LoopyError("When trying to index the array '%s' along axis " + if vectorization_info: + # failed vectorization + raise Unvectorizable( + "When trying to vectorize the array '%s' along axis " "%d (tagged '%s'), the index was not a compile-time " "constant (but it has to be in order for code to be " - "generated). You likely want to unroll the iname(s) '%s'." + "generated). You likely want to unroll the iname(s) '%s'" % (ary.name, i, ary.dim_tags[i], str(e))) + else: + raise LoopyError( + "When trying to unroll the array '%s' along axis " + "%d (tagged '%s'), the index was not an unrollable-iname " + "or constant (but it has to be in order for code to be " + "generated). You likely want to unroll/change array index(s)" + " '%s'" % (ary.name, i, ary.dim_tags[i], str(e))) if not is_integer(result): - raise LoopyError("subscript '%s[%s]' has non-constant " + # try to simplify further + from loopy.isl_helpers import simplify_via_aff + result = simplify_via_aff(result) + + if any([x not in var_subst_map for x in get_dependencies(result)]): + raise error_type("subscript '%s[%s]' has non-constant " "index for separate-array axis %d (0-based)" % ( ary.name, index, i)) - return result def apply_offset(sub): @@ -1290,11 +1315,50 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)): if isinstance(dim_tag, SeparateArrayArrayDimTag): - idx = eval_expr_assert_integer_constant(i, idx) + idx = eval_expr_assert_constant(i, idx, var_subst_map) array_name += "_s%d" % idx # }}} + def __get_simplified(arr): + from loopy.isl_helpers import simplify_via_aff + return [simplify_via_aff(arr[i]) for i in range(len(arr))] + + def is_contiguous(arr): + from loopy.isl_helpers import simplify_via_aff + from functools import cmp_to_key + if not len(arr) or len(arr) != vector_size: + return False + try: + sarr = sorted(arr) + except TypeError: + # tried to sort a pymbolic expression, try w/ comparison sort + sarr = sorted(arr, key=cmp_to_key( + lambda x, y: simplify_via_aff(x - y) > 0)) + return simplify_via_aff(sarr[-1] - sarr[0] + 1) == vector_size + + def is_monotonic(arr): + if not len(arr): + return False + signs = __get_simplified( + [arr[i + 1] - arr[i] for i in range(len(arr) - 1)]) + # check if array is monotonic increasing / decreasing + signs = [x < 0 for x in signs] + return all(s == signs[0] for s in signs[1:]) + + def run_over_vecrange(i, idx, base_subs): + evaled = [] + for veci in range(vector_size): + try: + subsi = base_subs.copy() + subsi[vectorization_info.iname] = veci + evaled.append(eval_expr_assert_constant(i, idx, subsi)) + except Unvectorizable: + pass + return __get_simplified(evaled) + + vec_op_type = None + # {{{ process remaining dim tags for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)): @@ -1311,6 +1375,27 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): elif stride is lp.auto: stride = var(array_name + "_stride%d" % i) + if vectorization_info and \ + vectorization_info.iname in get_dependencies(idx): + # need to determine here whether the vector iname is aligned with + # the vector size -> shuffle, or unaligned -> load + evaled = run_over_vecrange(i, idx, var_subst_map) + if is_monotonic(evaled): + vec_op_type = 'shuffle' if all(x == evaled[0] for x in evaled) \ + else 'load' + else: + raise Unvectorizable('Vectorized iname %s present in ' + 'unvectorized axis %s (1-based) access "%s", and not ' + 'simplifiable to compile-time contigous access' % ( + vectorization_info.iname, i + 1, idx)) + elif vectorization_info: + vec_op_type = 'shuffle' # independent of vector iname + + # update vector operation type if necessary + if vector_index is not None and isinstance(vector_index, tuple): + assert vector_index[0] is None # pylint: disable=E1136 + vector_index = (vec_op_type, vector_index[1]) # pylint: disable=E1136; # noqa + subscripts[dim_tag.target_axis] += (stride // vector_size)*idx elif isinstance(dim_tag, SeparateArrayArrayDimTag): @@ -1318,18 +1403,29 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): elif isinstance(dim_tag, VectorArrayDimTag): from pymbolic.primitives import Variable - if (vectorization_info is not None - and isinstance(index[i], Variable) + if (vectorization_info and isinstance(index[i], Variable) and index[i].name == vectorization_info.iname): # We'll do absolutely nothing here, which will result # in the vector being returned. pass else: - idx = eval_expr_assert_integer_constant(i, idx) - - assert vector_index is None - vector_index = idx + if vectorization_info: + # check dependencies + deps = get_dependencies(idx) - set(var_subst_map.keys()) + if len(deps) == 1 and vectorization_info.iname in deps: + # we depend only on the vectorized iname -- see if we can + # simplify to a load / shuffle + evaled = run_over_vecrange(i, idx, var_subst_map) + if is_contiguous(evaled): + # we can generate a load or shuffle depending on the + # alignment + vector_index = (vec_op_type, evaled) + + if vector_index is None: + # if we haven't generated a load of shuffle... + idx = eval_expr_assert_constant(i, idx, var_subst_map) + vector_index = idx else: raise LoopyError("unsupported array dim implementation tag '%s' " @@ -1345,6 +1441,9 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): subscripts[0] = apply_offset(subscripts[0]) + if isinstance(vector_index, tuple): + assert vector_index[0] is not None, 'Unknown vectorization type' + return AccessInfo( array_name=array_name, vector_index=vector_index, diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index dd68c950e04e29fc99c456412d9dc4a53dbc61b2..d3da73119ad11fa7be56169d3c4eab14aa3ae903 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -502,6 +502,7 @@ class TemporaryVariable(ArrayBase): the temporary as a ``restrict`` const pointer to the base storage memory location. If *True*, the restrict part is omitted on this declaration. + """ min_target_axes = 0 @@ -514,7 +515,7 @@ class TemporaryVariable(ArrayBase): "base_storage", "initializer", "read_only", - "_base_storage_access_may_be_aliasing", + "_base_storage_access_may_be_aliasing" ] def __init__(self, name, dtype=None, shape=(), address_space=None, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f5cf07b0e1d62212ce36edb48f47eb7de7d31451..ccbdbaf49d7d84fdb4ee056ae57a50eb0c5aa26c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -75,6 +75,8 @@ class IdentityMapperMixin(object): def map_array_literal(self, expr, *args): return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children)) + map_vector_literal = map_array_literal + def map_group_hw_index(self, expr, *args): return expr @@ -106,9 +108,11 @@ class IdentityMapperMixin(object): return expr def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child)) + kwargs = {} + return type(expr)(expr.type, self.rec(expr.child), **kwargs) map_type_cast = map_type_annotation + map_vector_type_cast = map_type_annotation map_linear_subscript = IdentityMapperBase.map_subscript @@ -139,6 +143,8 @@ class WalkMapper(WalkMapperBase): for ch in expr.children: self.rec(ch, *args) + map_vector_literal = map_array_literal + def map_group_hw_index(self, expr, *args): self.visit(expr) @@ -156,6 +162,7 @@ class WalkMapper(WalkMapperBase): return self.rec(expr.child, *args) + map_vector_type_cast = map_type_cast map_tagged_variable = WalkMapperBase.map_variable def map_loopy_function_identifier(self, expr, *args): @@ -195,6 +202,10 @@ class StringifyMapper(StringifyMapperBase): def map_array_literal(self, expr, *args): return "{%s}" % ", ".join(self.rec(ch) for ch in expr.children) + def map_vector_literal(self, expr, *args): + from pymbolic.mapper.stringifier import PREC_NONE + return "(%s)" % ", ".join(self.rec(ch, PREC_NONE) for ch in expr.children) + def map_group_hw_index(self, expr, enclosing_prec): return "grp.%d" % expr.index @@ -232,6 +243,11 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_vector_type_cast(self, expr, enclosing_prec): + from pymbolic.mapper.stringifier import PREC_NONE + return "cast(%s, %s)" % (repr(expr.type_name), self.rec( + expr.child, PREC_NONE)) + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): @@ -289,6 +305,14 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) + map_vector_type_cast = map_type_cast + + def map_literal(self, expr): + return set() + + def map_vector_literal(self, expr): + return self.combine(self.rec(child) for child in expr.children) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -365,6 +389,25 @@ class ArrayLiteral(p.Leaf): mapper_method = "map_array_literal" +class VectorLiteral(p.Leaf): + """An vector dtype literal.""" + + # Currently only in conjunction with the VectorTypeCast + + def __init__(self, children): + self.children = children + + def stringifier(self): + return StringifyMapper + + def __getinitargs__(self): + return (self.children,) + + init_arg_names = ("children",) + + mapper_method = "map_vector_literal" + + class HardwareAxisIndex(p.Leaf): def __init__(self, axis): self.axis = axis @@ -475,6 +518,62 @@ class TypeCast(p.Expression): mapper_method = intern("map_type_cast") +class VectorTypeCast(p.Expression): + """ + A workaround for casts of vector temporaries, e.g.: + (int4)(0, 1, 2, 3) + + Useful for inserting temporaries into expressions to avoid unvectorizable code + + .. attribute:: type + + The (non-vector) numpy type to cast to. e.g., if using 'int4', the type + would be np.int32 + + .. attribute:: child + + The :class:`VectorLiteral` initializer list to convert to via typecast + + .. attribute:: type_name + + The stringified type (including vector size), e.g., 'int4' + """ + + def __init__(self, type, init, type_name): + super(VectorTypeCast, self).__init__() + + from loopy.types import to_loopy_type, NumpyType + type = to_loopy_type(type) + + if (not isinstance(type, NumpyType) + or not issubclass(type.dtype.type, np.number)): + from loopy.diagnostic import LoopyError + raise LoopyError("TypeCast only supports numerical numpy types, " + "not '%s'" % type) + + # We're storing the type as a name for now to avoid + # numpy pickling bug madness. (see loopy.types) + self.type_name = type_name + self.child = VectorLiteral(tuple(s for s in init)) + self._base_type = type.dtype + + @property + def type(self): + from loopy.types import NumpyType + return NumpyType(self._base_type) + + # init_arg_names is a misnomer--they're attribute names used for pickling. + init_arg_names = ("type_name", "child", "_base_type") + + def __getinitargs__(self): + return (self.type_name, self.child, self._base_type) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_vector_type_cast") + + class TaggedVariable(p.Variable): """This is an identifier with a tag, such as 'matrix$one', where 'one' identifies this specific use of the identifier. This mechanism @@ -1740,6 +1839,8 @@ class BatchedAccessRangeMapper(WalkMapper): def map_type_cast(self, expr, inames): return self.rec(expr.child, inames) + map_vector_type_cast = map_type_cast + class AccessRangeMapper(object): """**IMPORTANT** diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a81354e2fd7b52ba514af936441c7a2d980c77b5..879f9f1f34092add33dca026d0cba39ab299e090 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -220,7 +220,10 @@ class ASTBuilderBase(object): def can_implement_conditionals(self): return False - def emit_if(self, condition_str, ast): + def emit_if(self, condition_mapper, ast): + raise NotImplementedError() + + def emit_vector_if(self, condition_mapper, ast): raise NotImplementedError() def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 17dd9dc1034f4572e2bcf1d3abc806354c73336e..38a44d5727d4af321bc50a27fc9c56031f082cbe 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -967,9 +967,9 @@ class CASTBuilder(ASTBuilderBase): def can_implement_conditionals(self): return True - def emit_if(self, condition_str, ast): + def emit_if(self, condition_mapper, ast): from cgen import If - return If(condition_str, ast) + return If(condition_mapper(), ast) # }}} diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 8ef921e447bf10d85ac60460f904d528ac64da19..934b9b78db1b3fd6421645c4d1ca6a5d8c250249 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -183,15 +183,22 @@ class ExpressionToCExpressionMapper(IdentityMapper): ary = self.find_array(expr) from loopy.kernel.array import get_access_info - from pymbolic import evaluate from loopy.symbolic import simplify_using_aff index_tuple = tuple( simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple) + from loopy.expression import VectorizabilityChecker + var_subst_map = self.codegen_state.var_subst_map.copy() + if self.codegen_state.vectorization_info: + ctc_iname = self.codegen_state.vectorization_info.iname + ctc = VectorizabilityChecker.compile_time_constants( + self.codegen_state.kernel, + ctc_iname) + var_subst_map.update(ctc) + access_info = get_access_info(self.kernel.target, ary, index_tuple, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) + var_subst_map, self.codegen_state.vectorization_info) from loopy.kernel.data import ( ImageArg, ArrayArg, TemporaryVariable, ConstantArg) @@ -252,8 +259,34 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.kernel, self.rec(subscript, 'i'))) if access_info.vector_index is not None: - return self.codegen_state.ast_builder.add_vector_access( - result, access_info.vector_index) + if isinstance(access_info.vector_index, tuple): + # check for specific vector access nodes + try: + method, ind = access_info.vector_index + method = getattr(self.codegen_state.ast_builder, + 'add_vector_%s' % method) + return method(result, + self.codegen_state.vectorization_info.iname, + ary, ind) + except AttributeError: + from loopy.codegen import Unvectorizable + raise Unvectorizable('Target %s has no map node for ' + 'method add_vector_%s' % ( + str(type(self.codegen_state.ast_builder)), + method)) + + try: + from loopy.tools import is_integer + assert is_integer(access_info.vector_index) + return self.codegen_state.ast_builder.add_vector_access( + result, access_info.vector_index) + except AssertionError: + from loopy.codegen import Unvectorizable + raise Unvectorizable( + "Cannot add vector access for non-integer vector addressing " + "did you mean to tag iname '%s' as a vector index?" % ( + access_info.vector_index)) + else: return result @@ -358,6 +391,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): cast = var("(%s)" % registry.dtype_to_ctype(expr.type)) return cast(self.rec(expr.child, type_context)) + def map_vector_type_cast(self, expr, type_context): + cast = var("(%s)" % expr.type_name) + return cast(self.rec(expr.child, type_context)) + def map_constant(self, expr, type_context): if isinstance(expr, (complex, np.complexfloating)): try: @@ -411,9 +448,15 @@ class ExpressionToCExpressionMapper(IdentityMapper): ary = self.find_array(arg) from loopy.kernel.array import get_access_info - from pymbolic import evaluate + var_subst_map = self.codegen_state.var_subst_map.copy() + if self.codegen_state.vectorization_info: + from loopy.expression import VectorizabilityChecker + ctc = VectorizabilityChecker.compile_time_constants( + self.codegen_state.kernel, + self.codegen_state.vectorization_info.iname) + var_subst_map.update(ctc) access_info = get_access_info(self.kernel.target, ary, arg.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), + self.codegen_state.var_subst_map.copy(), self.codegen_state.vectorization_info) from loopy.kernel.data import ImageArg @@ -907,6 +950,9 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_array_literal(self, expr, enclosing_prec): return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE) + def map_vector_literal(self, expr, enclosing_prec): + return "( %s )" % self.join_rec(", ", expr.children, PREC_NONE) + # }}} # vim: fdm=marker diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index cccee2301e44b16e2454bda5e98af7db7893c003..0e56d9fa69249fd801231102445c7970369cb55a 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -109,11 +109,17 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): if lsize: lsize, = lsize from loopy.kernel.array import get_access_info - from pymbolic import evaluate + + var_subst_map = self.codegen_state.var_subst_map.copy() + if self.codegen_state.vectorization_info: + from loopy.expression import VectorizabilityChecker + ctc = VectorizabilityChecker.compile_time_constants( + self.codegen_state.kernel, + self.codegen_state.vectorization_info.iname) + var_subst_map.update(ctc) access_info = get_access_info(self.kernel.target, ary, expr.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) + var_subst_map, self.codegen_state.vectorization_info) subscript, = access_info.subscripts result = var(access_info.array_name)[ @@ -397,15 +403,20 @@ class ISPCASTBuilder(CASTBuilder): ary = ecm.find_array(lhs) from loopy.kernel.array import get_access_info - from pymbolic import evaluate - from loopy.symbolic import simplify_using_aff index_tuple = tuple( simplify_using_aff(kernel, idx) for idx in lhs.index_tuple) + var_subst_map = codegen_state.var_subst_map.copy() + if codegen_state.vectorization_info: + from loopy.expression import VectorizabilityChecker + ctc = VectorizabilityChecker.compile_time_constants( + codegen_state.kernel, + codegen_state.vectorization_info.iname) + var_subst_map.update(ctc) + access_info = get_access_info(kernel.target, ary, index_tuple, - lambda expr: evaluate(expr, codegen_state.var_subst_map), - codegen_state.vectorization_info) + var_subst_map, codegen_state.vectorization_info) from loopy.kernel.data import ArrayArg, TemporaryVariable diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 8a6e5284258d864d19d7f1353ec9dfaaa7d72a9b..f976dbd0cc973098ab96841ca6051da7e04c055f 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -34,6 +34,7 @@ from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper, c_math_mangler from loopy.kernel.data import AddressSpace, CallMangleInfo from pymbolic import var +from pymbolic.primitives import Call from functools import partial @@ -148,8 +149,9 @@ _CL_SIMPLE_MULTI_ARG_FUNCTIONS = { } -VECTOR_LITERAL_FUNCS = dict( - ("make_%s%d" % (name, count), (name, dtype, count)) +def get_vector_func(func, template): + return dict( + (template % dict(func=func, name=name, count=count), (name, dtype, count)) for name, dtype in [ ('char', np.int8), ('uchar', np.uint8), @@ -166,6 +168,11 @@ VECTOR_LITERAL_FUNCS = dict( ) +VECTOR_LITERAL_FUNCS = get_vector_func('make', '%(func)s_%(name)s%(count)d') +VECTOR_STORE_FUNCS = get_vector_func('vstore', '%(func)s%(count)d') +VECTOR_LOAD_FUNCS = get_vector_func('vload', '%(func)s%(count)d') + + def opencl_function_mangler(kernel, name, arg_dtypes): if not isinstance(name, str): return None @@ -220,6 +227,27 @@ def opencl_function_mangler(kernel, name, arg_dtypes): NumpyType(dtype), count),), arg_dtypes=(NumpyType(dtype),)*count) + if name in VECTOR_LOAD_FUNCS or name in VECTOR_STORE_FUNCS: + if name in VECTOR_LOAD_FUNCS: + load = True + _, dtype, count = VECTOR_LOAD_FUNCS[name] + result = (kernel.target.vector_dtype(NumpyType(dtype), count),) + args = (kernel.index_dtype, NumpyType(dtype)) + else: + load = False + _, dtype, count = VECTOR_STORE_FUNCS[name] + result = tuple() + args = (kernel.target.vector_dtype(NumpyType(dtype), count), + kernel.index_dtype, NumpyType(dtype)) + + if not ((count == 2 and load) or (count == 3 and not load)): + return None + + return CallMangleInfo( + target_name=name, + result_dtypes=result, + arg_dtypes=args) + return None # }}} @@ -299,6 +327,54 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper): def map_local_hw_index(self, expr, type_context): return var("lid")(expr.axis) + def map_vector_literal(self, expr, type_context): + return var(', '.join('%s' % x for x in expr.children)) + + def map_comparison(self, expr, type_context): + from loopy.symbolic import get_dependencies + from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type + from six import iteritems + + vec_inames = set([iname for iname, tags in + iteritems(self.kernel.iname_to_tags) + if filter_iname_tags_by_type(tags, VectorizeTag)]) + + if get_dependencies(expr) & vec_inames and \ + self.codegen_state.insn_was_not_vectorizable: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(self.codegen_state.kernel, + 'unrolled_vector_iname_conditional', + 'Unrolled vector-loop iname detected in vector ' + + 'comparison; this may in unexpected truth-values.') + + return super(ExpressionToOpenCLCExpressionMapper, self).map_comparison( + expr, type_context) + + def wrap_in_typecast(self, actual_type, needed_dtype, s): + wrap = super(ExpressionToOpenCLCExpressionMapper, self).wrap_in_typecast( + actual_type, needed_dtype, s) + if self.codegen_state.vectorization_info is not None and ( + actual_type != needed_dtype): + from loopy.symbolic import get_dependencies + from loopy.kernel.array import VectorArrayDimTag + rhs_deps = get_dependencies(s) + + def is_vector(var): + return any(isinstance(x, VectorArrayDimTag) for x in var.dim_tags) + # if we have a vector-type on the RHS and the RHS dtype != LHS dtype, + # we need an explicit conversion + rhs_temp_vars = rhs_deps & set(self.kernel.temporary_variables.keys()) + rhs_args = rhs_deps & set(self.kernel.arg_dict.keys()) + if any(is_vector(self.kernel.temporary_variables[x]) + for x in rhs_temp_vars) or any( + is_vector(self.kernel.arg_dict[x]) for x in rhs_args): + ctype = self.kernel.target.get_dtype_registry().dtype_to_ctype( + needed_dtype) + vw = self.codegen_state.vectorization_info.length + # need to add an explicit conversion + return var("convert_%s%d" % (ctype, vw))(wrap) + return wrap + # }}} @@ -359,6 +435,67 @@ class OpenCLTarget(CTarget): # }}} +# {{{ simple opencl function wrappers + + +class VectorFunc(Call): + def __init__(self, function, parameters): + # check that function and parameters are variables + from pymbolic.primitives import Variable, Expression + if not isinstance(function, Variable): + function = var(function) + parameters = list(parameters) + for i, param in enumerate(parameters): + if not isinstance(param, (Variable, Expression)): + parameters[i] = var(str(param)) + super(VectorFunc, self).__init__(function, tuple(parameters)) + + +class VectorStore(VectorFunc): + def __init__(self, vector_width, store, offset, array): + """ + Represents a vstoren + + :arg vector_width: the SIMD vector-width + :arg store: the data to store + :arg offset: the offset in the array + :arg array: the array to store the data in + """ + + name = 'vstore%d' % vector_width + super(VectorStore, self).__init__(name, (store, offset, array)) + + +class VectorLoad(VectorFunc): + def __init__(self, vector_width, offset, array): + """ + Represents a vloadn + + :arg vector_width: the SIMD vector-width + :arg offset: the offset in the array + :arg array: the array to store the data in + """ + + name = 'vload%d' % vector_width + super(VectorLoad, self).__init__(name, (offset, array)) + + +class VectorSelect(VectorFunc): + def __init__(self, select_if_true, select_if_false, condition): + """ + Represents a vector-select + + :arg select_if_true: the value to be chosen if select_on is true + :arg select_if_false: the value to be chosen if select_on is false + :arg condition: the conditional selection value + """ + + name = 'select' + super(VectorSelect, self).__init__(name, ( + select_if_false, select_if_true, condition)) + +# }}} + # {{{ ast builder @@ -437,9 +574,109 @@ class OpenCLCASTBuilder(CASTBuilder): def get_expression_to_c_expression_mapper(self, codegen_state): return ExpressionToOpenCLCExpressionMapper(codegen_state) + def emit_assignment(self, codegen_state, insn): + """ + A wrapper around the base C-target emit_assignment, to handle explicit-SIMD + functions, such as selects, vstore's and vload's and shuffles + """ + + assignment = super(OpenCLCASTBuilder, self).emit_assignment( + codegen_state, insn) + + # fix-up + try: + if isinstance(assignment.lvalue.expr, VectorLoad): + from cgen import Statement + # get vector width + func = str(assignment.lvalue.expr.function) + vw = int(func[func.index('vload') + len('vload'):]) + # convert to vector store + store = VectorStore(vw, assignment.rvalue.expr, + *assignment.lvalue.expr.parameters) + # and to statement + assignment = Statement(str(store)) + except AttributeError: + pass + return assignment + + def emit_vector_if(self, condition_mapper, ast): + """ + Emit's a vector select function + """ + + def vecify(assign): + try: + # treat it as an assignment + return Assign(str(assign.lvalue.expr), str(VectorSelect( + assign.rvalue.expr, assign.lvalue.expr, + condition_mapper(assign)))) + except AttributeError: + return False + + from cgen import Assign, Block + vec_if = vecify(ast) + if not vec_if: + try: + vec_if = [] + for assign in ast.contents: + vec_if.append(vecify(assign)) + if any(not x for x in vec_if): + # one 'assign' failed + vec_if = False + else: + vec_if = Block(vec_if) + except AttributeError: + vec_if = False + if not vec_if and isinstance(ast, Block): + import logging + logger = logging.getLogger(__name__) + logger.info('Cannot convert non-simple assign statement for instruction ' + '(%s) to vectorized conditional. ' + 'Assuming that this is the result of a previous unvectorize ' + 'call.') + return ast + + return vec_if + def add_vector_access(self, access_expr, index): # The 'int' avoids an 'L' suffix for long ints. - return access_expr.attr("s%s" % hex(int(index))[2:]) + def __map(ind, use_prefix=True): + strmap = 's%s' if use_prefix else '%s' + start = 2 + return strmap % hex(int(ind))[start:] + try: + lookup = '' + for i, ind in enumerate(index): + lookup += __map(ind, not i) + except TypeError: + # not iterable + lookup = __map(index) + return access_expr.attr(lookup) + + def add_vector_shuffle(self, access_expr, vec_iname, array, index): + # this can simply call a vector access with the index list + return self.add_vector_access(access_expr, index) + + def add_vector_load(self, access_expr, vec_iname, array, index): + from pymbolic import substitute + # get ctype for casting + ctype = str(array.get_arg_decl( + self, '', array.shape, array.dtype, False)) + ctype = ctype[:ctype.rindex(array.name) - 1] + # and convert the vector access expression to a vector offset + # to do so, we substitute the vector iname -> 0 to eliminate it from the + # expression + offset = substitute(access_expr.index, {vec_iname: 0}) + # try simplify + from pymbolic.mapper.evaluator import UnknownVariableError + try: + from loopy.isl_helpers import simplify_via_aff + offset = simplify_via_aff(offset) + except UnknownVariableError: + pass + # and cast / substitute in the calculated vector iname offset + cast_expr = '&((%s)%s)[%s]' % (ctype, array.name, index[0]) + return VectorLoad(len(index), str(offset), str(cast_expr)) def emit_barrier(self, synchronization_kind, mem_kind, comment): """ @@ -541,6 +778,17 @@ class OpenCLCASTBuilder(CASTBuilder): # FIXME: Could detect operations, generate atomic_{add,...} when # appropriate. + if codegen_state.vectorization_info is not None or \ + codegen_state.insn_was_not_vectorizable: + # note - this check whether we've previously tried to vectorize and + # failed (in which case insn_was_not_vectorizable will be True) or + # whether vectorization_info is a valid :class:`VectorizationInfo` + # + # Both cases should fail (as we can't take the index of an unrolled + # atomic) + raise LoopyError('Atomic operators not implemented for ' + 'explicit-SIMD vectorization') + if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ np.int32, np.int64, np.float32, np.float64]: from cgen import Block, DoWhile, Assign diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d3d2a39dcf7126339055d32fa16ffcc25..ca4b116d0da72ae0c525f92a82732111ca8d1dbe 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -287,9 +287,9 @@ class PythonASTBuilderBase(ASTBuilderBase): def can_implement_conditionals(self): return True - def emit_if(self, condition_str, ast): + def emit_if(self, condition_mapper, ast): from genpy import If - return If(condition_str, ast) + return If(condition_mapper(), ast) def emit_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py index d4128bd115666cf66c6f06a40823ed9d5929faab..5c149f177c98d192f565b47a4856996424873997 100644 --- a/loopy/transform/privatize.py +++ b/loopy/transform/privatize.py @@ -104,6 +104,8 @@ def privatize_temporaries_with_inames( .. versionadded:: 2018.1 """ + from loopy.kernel.data import VectorizeTag, IlpBaseTag, filter_iname_tags_by_type + if isinstance(privatizing_inames, str): privatizing_inames = frozenset( s.strip() @@ -114,37 +116,90 @@ def privatize_temporaries_with_inames( s.strip() for s in only_var_names.split(",")) + def find_privitzing_inames(writer_insn, iname, temp_var): + # There are now two flavors of privitzing iname promotion, one for ILP and + # another for vectorization + + # Temporaries inside an ILP loop have have no additional requirements for + # promotion + + # However, we should _not_ assume that this is the case for temporaries + # inside a vectorizing loop. Instead, only temporaries written to by + # instructions that directly depend on the vector iname should be promoted. + # This is to avoid spurious promotions of constants (not-dependent on the + # vector iname) to vector dtypes, for example (w/ j_inner the vectorizing + # iname, and 'a' a data-array with a vector dtype on the second axis): + # + # ``` + # for j_outer + # for j_inner + # <> c = function() + # a[c, j_inner] = 1 + # end + # end + # ``` + # + # is perfectly valid -- however, if c is promoted to a vector-dtype, we will + # hit issues with a (potentially) non-constant "vector" index being in a + # non-vector axis. Hence, we must be cautions in vector promotions; those + # vector temporaries _not_ written to by a directly vector-iname dependent + # instruction will be promoted in the second stage (recursive application of + # the write map) + + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], IlpBaseTag): + return set([iname]) + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag): + # For vector inames, we should only consider an iname if the + # instruction has a _direct_ dependency on it (to avoid spurious vector + # promotions). Missed promotions will be handled in the recursive + # application step + return set([iname]) & writer_insn.dependency_names() + return set() + + # {{{ Stage 1: find variables that need extra indices + + from collections import defaultdict + tv_wmap = defaultdict(lambda: set()) wmap = kernel.writer_map() - var_to_new_priv_axis_iname = {} - # {{{ find variables that need extra indices - for tv in six.itervalues(kernel.temporary_variables): + # check variables to transform if only_var_names is not None and tv.name not in only_var_names: continue - for writer_insn_id in wmap.get(tv.name, []): + for writer_insn_id in set(wmap.get(tv.name, [])): writer_insn = kernel.id_to_insn[writer_insn_id] + test_inames = kernel.insn_inames(writer_insn) & privatizing_inames - priv_axis_inames = kernel.insn_inames(writer_insn) & privatizing_inames + # see stage 2 + for tv_read in writer_insn.read_dependency_names(): + if tv_read in kernel.temporary_variables: + tv_wmap[tv_read].add(tv.name) + priv_axis_inames = set() + for ti in test_inames: + priv_axis_inames |= find_privitzing_inames(writer_insn, ti, tv) + + priv_axis_inames = frozenset(priv_axis_inames) referenced_priv_axis_inames = (priv_axis_inames - & writer_insn.write_dependency_names()) + & writer_insn.write_dependency_names()) new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames if not new_priv_axis_inames: - break + continue if tv.name in var_to_new_priv_axis_iname: - if new_priv_axis_inames != set(var_to_new_priv_axis_iname[tv.name]): + if new_priv_axis_inames != set( + var_to_new_priv_axis_iname[tv.name]): + # conflict raise LoopyError("instruction '%s' requires adding " - "indices for privatizing var '%s' on iname(s) '%s', " + "indices for vector/ILP inames '%s' on var '%s', " "but previous instructions required inames '%s'" - % (writer_insn_id, tv.name, - ", ".join(new_priv_axis_inames), - ", ".join(var_to_new_priv_axis_iname[tv.name]))) + % (writer_insn_id, ", ".join(new_priv_axis_inames), + tv.name, ", ".join( + var_to_new_priv_axis_iname[tv.name]))) continue @@ -152,7 +207,51 @@ def privatize_temporaries_with_inames( # }}} - # {{{ find ilp iname lengths + # {{{ Stage 2: recursively apply vector temporary write heuristic + + # A temporary variable that's only assigned to from other vector + # temporaries will never have a direct-dependency on the vector + # iname. After building a map of which temporary variables write to + # others, we can recursively travel down the temporary variable write-map + # of any newly vectorized temporary variable, and extend the + # vectorization to those temporary variables dependent on it. + # + # See ..func: `find_privitzing_inames` for reasoning about vector temporary + # promotion + + def recursively_apply(varname, starting_dict, applied=None): + if applied is None: + # root case, set up set of variables we've already applied to act as + # a base case and avoid infinite recursion. + applied = set() + + if varname not in tv_wmap or varname in applied: + # if no other variables depend on the starting variable, or the starting + # variable's privitizing inames have already been applied + return starting_dict + + applied.add(varname) + for written_to in tv_wmap[varname]: + if written_to not in starting_dict: + starting_dict[written_to] = set() + # update the dependency + starting_dict[written_to] |= starting_dict[varname] + # and recursively apply to the dependecy's dependencies + starting_dict.update(recursively_apply( + written_to, starting_dict.copy(), applied=applied)) + + return starting_dict + + # apply recursive write heueristic + for varname in list(var_to_new_priv_axis_iname.keys()): + if any(filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag) + for iname in var_to_new_priv_axis_iname[varname]): + var_to_new_priv_axis_iname.update(recursively_apply( + varname, var_to_new_priv_axis_iname.copy())) + + # }}} + + # {{{ find privitizing iname lengths from loopy.isl_helpers import static_max_of_pw_aff from loopy.symbolic import pw_aff_to_expr @@ -174,8 +273,6 @@ def privatize_temporaries_with_inames( # {{{ change temporary variables - from loopy.kernel.data import VectorizeTag - new_temp_vars = kernel.temporary_variables.copy() for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname): tv = new_temp_vars[tv_name] @@ -210,13 +307,13 @@ def privatize_temporaries_with_inames( new_insn = insn.with_transformed_expressions(eiii) if not eiii.seen_priv_axis_inames <= insn.within_inames: raise LoopyError( - "Kernel '%s': Instruction '%s': touched variable that " - "(for privatization, e.g. as performed for ILP) " - "required iname(s) '%s', but that the instruction was not " - "previously within the iname(s). To remedy this, first promote" - "the instruction into the iname." - % (kernel.name, insn.id, ", ".join( - eiii.seen_priv_axis_inames - insn.within_inames))) + "Kernel '%s': Instruction '%s': touched variable that " + "(for privatization, e.g. as performed for ILP) " + "required iname(s) '%s', but that the instruction was not " + "previously within the iname(s). To remedy this, first promote" + "the instruction into the iname." + % (kernel.name, insn.id, ", ".join( + eiii.seen_priv_axis_inames - insn.within_inames))) new_insns.append(new_insn) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 010a0658f71bcfcb037a81c6b61fd9417fc98b75..7716bcf468d6781fb58ca83dee978b3faa6f1485 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -243,6 +243,11 @@ class TypeInferenceMapper(CombineMapper): raise LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type)) return [expr.type] + map_vector_type_cast = map_type_cast + + def map_vector_literal(self, expr): + return self.combine([self.rec(child) for child in expr.children]) + def map_subscript(self, expr): return self.rec(expr.aggregate) @@ -349,14 +354,26 @@ class TypeInferenceMapper(CombineMapper): dtype = field[0] return [NumpyType(dtype)] - def map_comparison(self, expr): + def map_comparison_types(self, dtype): # "bool" is unusable because OpenCL's bool has indeterminate memory # format. - return [NumpyType(np.dtype(np.int32))] - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison + if dtype[0].itemsize == 8: + return [NumpyType(np.dtype(np.int64))] + else: + return [NumpyType(np.dtype(np.int32))] + + def map_logical_not(self, expr): + return self.map_comparison_types(self.rec(expr.child)) + + def map_logical_and(self, expr): + return self.map_comparison_types( + self.combine([self.rec(x) for x in expr.children])) + map_logical_or = map_logical_and + + def map_comparison(self, expr): + return self.map_comparison_types( + self.combine([self.rec(expr.left), self.rec(expr.right)])) def map_group_hw_index(self, expr, *args): return [self.kernel.index_dtype] diff --git a/test/test_loopy.py b/test/test_loopy.py index 89b4f5e639a031d3f2d4d89b470d2ccf5fb4b848..84a23214cb7453d1e200b3cdea17ef4fe401571d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -770,7 +770,7 @@ def test_vector_types(ctx_factory, vec_len): ref_knl = knl - knl = lp.tag_data_axes(knl, "out", "c,vec") + knl = lp.tag_array_axes(knl, "out", "c,vec") knl = lp.tag_inames(knl, dict(j="unr")) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") @@ -2739,7 +2739,7 @@ def test_preamble_with_separate_temporaries(ctx_factory): print(lp.generate_code(kernel)[0]) # and call (functionality unimportant, more that it compiles) - ctx = cl.create_some_context() + ctx = ctx_factory() queue = cl.CommandQueue(ctx) # check that it actually performs the lookup correctly assert np.allclose(kernel( @@ -2800,6 +2800,491 @@ def test_add_prefetch_works_in_lhs_index(): assert "a1_map" not in get_dependencies(insn.assignees) +def test_explicit_simd_shuffles(ctx_factory): + ctx = ctx_factory() + + def create_and_test(insn, answer=None, atomic=False, additional_check=None, + store=False): + knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'], + insn, + [lp.GlobalArg('a', shape=(1, 14,), dtype=np.int32, + for_atomic=atomic), + lp.GlobalArg('b', shape=(1, 14,), dtype=np.int32, + for_atomic=atomic)]) + + knl = lp.split_iname(knl, 'i', 4, inner_tag='vec') + knl = lp.tag_inames(knl, [('j', 'g.0')]) + knl = lp.split_array_axis(knl, ['a', 'b'], 1, 4) + knl = lp.tag_array_axes(knl, ['a', 'b'], 'N1,N0,vec') + + print(lp.generate_code_v2(knl).device_code()) + queue = cl.CommandQueue(ctx) + if answer is None: + answer = np.zeros(16, dtype=np.int32) + if store: + answer[2:-2] = np.arange(0, 12, dtype=np.int32) + else: + answer[:-4] = np.arange(2, 14, dtype=np.int32) + + a = np.zeros((1, 4, 4), dtype=np.int32) + b = np.arange(16, dtype=np.int32).reshape((1, 4, 4)) + result = knl(queue, a=a, b=b)[1][0] + + assert np.array_equal(result.flatten('C'), answer) + if additional_check is not None: + assert additional_check(knl) + + # test w/ compile time temporary constant + create_and_test("<>c = 2\n" + + "a[j, i] = b[j, i + c]", + additional_check=lambda knl: 'vload' in lp.generate_code_v2( + knl).device_code()) + create_and_test("a[j, i] = b[j, i + 2]") + create_and_test("a[j, i] = b[j, i + 2] + a[j, i]") + create_and_test("a[j, i] = a[j, i] + b[j, i + 2]") + # test vector stores + create_and_test("<>c = 2\n" + + "a[j, i + c] = b[j, i]", + additional_check=lambda knl: 'vstore' in lp.generate_code_v2( + knl).device_code(), + store=True) + create_and_test("a[j, i + 2] = b[j, i]", store=True) + create_and_test("a[j, i + 2] = b[j, i] + a[j, i + 2]", store=True) + create_and_test("a[j, i + 2] = a[j, i + 2] + b[j, i]", store=True) + # test small vector shuffle + shuffled = np.arange(16, dtype=np.int32)[(np.arange(16) + 2) % 4 + + 4 * (np.arange(16) // 4)] + shuffled[12:] = 0 + create_and_test("a[j, i] = b[j, (i + 2) % 4 + 4 * (i // 4)]", shuffled) + create_and_test("a[j, (i + 2) % 4 + 4 * (i // 4)] = b[j, i]", shuffled) + # test atomics + from loopy import LoopyError + from loopy.codegen import Unvectorizable + with pytest.raises((LoopyError, Unvectorizable)): + temp = np.arange(12, dtype=np.int32) + answer = np.zeros(4, dtype=np.int32) + for i in range(4): + answer[i] = np.sum(temp[(i + 2) % 4::4]) + create_and_test("a[j, (i + 2) % 4] = a[j, (i + 2) % 4] + b[j, i] {atomic}", + answer, True) + + +def test_explicit_simd_unr_iname(ctx_factory): + """ + tests as scatter load to a specific lane of a vector array via an unrolled iname + """ + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + insns = """ + for j_outer, lane, i + a[j_outer, i, lane] = b[j_outer + lane, i] + end + """ + knl = lp.make_kernel( + ['{[j_outer]: 0 <= j_outer < 4}', + '{[i]: 0 <= i < 4}', + '{[lane]: 0 <= lane < 4}'], + insns, + [lp.GlobalArg('a', shape=(4, 4, 4)), + lp.GlobalArg('b', shape=(8, 4))]) + + knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec') + knl = lp.tag_inames(knl, {'lane': 'unr'}) + knl = lp.prioritize_loops(knl, 'j_outer, i, lane') + + a = np.zeros((4, 4, 4)) + b = np.arange(8 * 4).reshape((8, 4)) + + a = knl(queue, a=a, b=b)[1][0] + # create answer + ans = np.tile(np.arange(4, dtype=np.float64), 16).reshape((4, 4, 4)) + ans *= 4 + ans += 4 * np.arange(4)[:, np.newaxis, np.newaxis] + np.arange(4)[:, np.newaxis] + + assert np.array_equal(a, ans) + + +def test_explicit_simd_temporary_promotion(ctx_factory): + from loopy.kernel.data import temp_var_scope as scopes + + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + # fun with vector temporaries + + def make_kernel(insn, ans=None, preamble=None, extra_inames=None, skeleton=None, + dtype=None): + skeleton = """ + %(preamble)s + for j + for i + %(insn)s + if test + a[i, j] = 1 + end + end + end + """ if skeleton is None else skeleton + dtype = dtype if dtype is not None else ( + ans.dtype if ans is not None else np.int32) + inames = ['i, j'] + if extra_inames is not None: + inames += list(extra_inames) + knl = lp.make_kernel( + '{[%(inames)s]: 0 <= %(inames)s < 12}' % {'inames': ', '.join(inames)}, + skeleton % dict(insn=insn, preamble='' if not preamble else preamble), + [lp.GlobalArg('a', shape=(12, 12), dtype=dtype), + lp.TemporaryVariable('mask', shape=(12,), initializer=np.array( + np.arange(12) >= 6, dtype=dtype), read_only=True, + scope=scopes.GLOBAL)]) + + knl = lp.split_iname(knl, 'j', 4, inner_tag='vec') + knl = lp.split_array_axis(knl, 'a', 1, 4) + knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec') + knl = lp.preprocess_kernel(knl) + + if ans is not None: + assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4), dtype=dtype))[ + 1][0], ans) + + return knl + + ans = np.zeros((12, 3, 4)) + ans[6:, :, :] = 1 + # case 1) -- incorrect promotion of temporaries to vector dtypes + make_kernel('<> test = mask[i]', ans) + + # next test the writer heuristic + + # case 2) assignment from a vector iname + knl = make_kernel('<> test = mask[j]') + assert knl.temporary_variables['test'].shape == (4,) + + # case 3) recursive dependency + knl = make_kernel(""" + <> test = mask[j] + <> test2 = test + """) + assert knl.temporary_variables['test2'].shape == (4,) + + # case 4) + # modified case from pyjac -- what makes this case special is that + # Kc is never directly assigned to in an instruction that directly references + # the vector iname, j_inner. Instead, it is a good test of the recursive + # vector temporary promotion, as it is written to by B_sum, which _is_ directly + # written to from an instruction (bset1) that references j_inner + skeleton = """ + for j + %(preamble)s + for i + %(insn)s + if i > 6 + <> P_val = 100 {id=pset0, nosync=pset1} + else + P_val = 0.01 {id=pset1, nosync=pset0} + end + <> B_sum = 0 {id=bset0} + for k + B_sum = B_sum + k * a[i, j] {id=bset1, dep=*:bset0} + end + # here, we are testing that Kc is properly promoted to a vector dtype + <> P_sum = P_val * i {id=pset2, dep=pset0:pset1} + B_sum = exp(B_sum) {id=bset2, dep=bset0:bset1} + <> Kc = P_sum * B_sum {id=kset, dep=bset*:pset2} + a[i, j] = Kc {dep=*:kset, nosync=pset0:pset1} + end + end + """ + + knl = make_kernel('', dtype=np.float32, skeleton=skeleton, extra_inames='k') + from loopy.kernel.array import VectorArrayDimTag + assert any(isinstance(x, VectorArrayDimTag) + for x in knl.temporary_variables['Kc'].dim_tags) + + +def test_explicit_simd_selects(ctx_factory): + ctx = ctx_factory() + + def create_and_test(insn, condition, answer, exception=None, a=None, b=None, + extra_insns=None, c=None, v=None, check=None, debug=False): + a = np.zeros((3, 4), dtype=np.int32) if a is None else a + data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)] + kwargs = dict(a=a) + if b is not None: + data += [lp.GlobalArg('b', shape=(12,), dtype=b.dtype)] + kwargs['b'] = b + if c is not None: + data += [lp.GlobalArg('c', shape=(12,), dtype=b.dtype)] + kwargs['c'] = c + names = [d.name for d in data] + # add after defining names to avoid trying to split value arg + if v is not None: + data += [lp.ValueArg('v', dtype=np.int32)] + kwargs['v'] = v + + knl = lp.make_kernel(['{[i]: 0 <= i < 12}'], + """ + for i + %(extra)s + if %(condition)s + %(insn)s + end + end + """ % dict(condition=condition, + insn=insn, + extra=extra_insns if extra_insns else ''), + data + ) + + knl = lp.split_iname(knl, 'i', 4, inner_tag='vec') + knl = lp.split_array_axis(knl, names, 0, 4) + knl = lp.tag_array_axes(knl, names, 'N0,vec') + if v is not None: + knl = lp.set_options(knl, write_wrapper=True) + + queue = cl.CommandQueue(ctx) + if check is not None: + assert check(knl) + elif exception is not None: + with pytest.raises(exception): + knl(queue, **kwargs) + else: + if not isinstance(answer, tuple): + answer = (answer,) + if debug: + print(lp.generate_code_v2(knl).device_code()) + result = knl(queue, **kwargs)[1] + for r, a in zip(result, answer): + assert np.array_equal(r.flatten('C'), a) + + ans = np.zeros(12, dtype=np.int32) + ans[7:] = 1 + # 1) test a conditional on a vector iname + create_and_test('a[i] = 1', 'i > 6', ans) + # 2) condition on a vector array + create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange( + 12, dtype=np.int32).reshape((3, 4))) + # 3) condition on a vector temporary + create_and_test('a[i] = 1', 'c', ans, extra_insns='<> c = (i < 7) - 1') + # 4) condition on an assigned vector array, this should work as assignment to a + # vector can be safely unrolled + create_and_test('a[i] = 1', '(b[i] > 6)', ans, + b=np.zeros((3, 4), dtype=np.int32), + extra_insns='b[i] = i') + # 5) a block of simple assignments, this should be seemlessly translated to + # multiple vector if statements + c_ans = np.ones(12, dtype=np.int32) + c_ans[7:] = 0 + create_and_test('a[i] = 1\nc[i] = 0', '(b[i] > 6)', (ans, c_ans), b=np.arange( + 12, dtype=np.int32).reshape((3, 4)), c=np.ones((3, 4), dtype=np.int32)) + # 6) test a negated conditional + ans_negated = np.invert(ans) + 2 + create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange( + 12, dtype=np.int32).reshape((3, 4))) + # 7) test conditional on differing dtype + ans_negated = np.invert(ans) + 2 + create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange( + 12, dtype=np.int64).reshape((3, 4))) + # 8) test conditional on differing dtype (float->int) and (int->float) + ans_negated = np.invert(ans) + 2 + create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange( + 12, dtype=np.float64).reshape((3, 4))) + create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange( + 12, dtype=np.int64).reshape((3, 4)), a=np.zeros((3, 4), dtype=np.float32)) + # 9) test conditional on valuearg, the "test" here is that we can actually + # generate the code + create_and_test('a[i] = 1', 'v', np.ones_like(ans), v=1) + + +@pytest.mark.parametrize(('lhs_dtype', 'rhs_dtype'), [ + (np.int32, np.int64), + (np.float32, np.float64)]) +def test_explicit_vector_dtype_conversion(ctx_factory, lhs_dtype, rhs_dtype): + ctx = ctx_factory() + + # test that dtype conversion happens correctly between differing vector-dtypes + def __make_kernel(insn, has_conversion=True, uses_temp=True): + vw = 4 + a_lp = lp.GlobalArg('a', shape=(12,), dtype=rhs_dtype) + temp_lp = lp.TemporaryVariable('temp', dtype=lhs_dtype) + + knl = lp.make_kernel(['{[i]: 0 <= i < 12}'], + """ + for i + {insn} + end + """.format(insn=insn), + [a_lp, temp_lp], + target=lp.PyOpenCLTarget(ctx.devices[0]), + silenced_warnings=['temp_to_write(temp)'] if not uses_temp else []) + knl = lp.split_iname(knl, 'i', vw, inner_tag='vec') + knl = lp.split_array_axis(knl, 'a', 0, 4) + knl = lp.tag_array_axes(knl, 'a', 'N0,vec') + + queue = cl.CommandQueue(ctx) + # check that the kernel compiles correctly + knl(queue, a=np.zeros((12,), dtype=rhs_dtype).reshape((3, 4))) + + # check that we have or don't have a conversion + assert ('convert_' in lp.generate_code_v2(knl).device_code()) == \ + has_conversion + + # test simple dtype conversion + __make_kernel("temp = a[i]") + + # test literal assignment + __make_kernel("a[i] = 1", False, False) + + # test that a non-vector temporary doesn't trigger conversion + # + # this should generate the code (e.g.,): + # __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) + # loopy_kernel(__global long4 *__restrict__ a) + # { + # int temp; + # for (int i_outer = 0; i_outer <= 2; ++i_outer) + # { + # temp = 1; + # a[i_outer] = temp; + # } + # } + # + # that is, temp should _not_ be assigned to "a" w/ convert_long4 + __make_kernel(""" + temp = 1 + a[i] = temp + """, has_conversion=False) + + # test that the inverse _does_ result in a convers + __make_kernel(""" + temp = a[i] {id=1, dep=*} + a[i] = temp {id=2, dep=1} + """) + + +@pytest.mark.parametrize('dtype', [np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize('vec_width', [2, 3, 4, 8, 16]) +def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype, vec_width): + ctx = ctx_factory() + + size = vec_width * 4 + + def create_and_test(insn, answer, shape=(1, size), debug=False, + vectors=['a', 'b']): + num_conditions = shape[0] + knl = lp.make_kernel(['{{[i]: 0 <= i < {}}}'.format(size), + '{{[j]: 0 <= j < {}}}'.format(num_conditions)], + insn, + [lp.GlobalArg('a', shape=shape, dtype=dtype), + lp.GlobalArg('b', shape=shape, dtype=dtype)]) + + knl = lp.split_iname(knl, 'i', 4, inner_tag='vec') + knl = lp.tag_inames(knl, [('j', 'g.0')]) + knl = lp.split_array_axis(knl, ['a', 'b'], 1, 4) + knl = lp.tag_array_axes(knl, vectors, 'N1,N0,vec') + + # ensure we can generate code + code = lp.generate_code_v2(knl).device_code() + if debug: + print(code) + # and check answer + queue = cl.CommandQueue(ctx) + + num_vectors = int(shape[1] / 4) + a = np.zeros((num_conditions, num_vectors, 4), dtype=dtype) + b = np.arange(num_conditions * num_vectors * 4, dtype=dtype).reshape( + (num_conditions, num_vectors, 4)) + result = knl(queue, a=a, b=b)[1][0] + + assert np.array_equal(result.flatten('C'), answer) + + ans = np.arange(size, dtype=np.int32) + ans[:7] = 0 + create_and_test(""" + if i >= 7 + a[j, i] = b[j, i] + end + """, ans) + + # a case that will result in a unvectorized evaluation + # this tests that we are properly able to unwind any vectorized conditional that + # has been applied, and then reapply the correct scalar conditional in + # unvectorize + ans = np.arange(12 * size, dtype=np.int32) + ans[:7] = 0 + create_and_test(""" + if j * 12 + i >= 7 + a[j, i] = b[j, i] + end + """, ans, shape=(12, size), vectors=['b']) + + +def test_vectorizability(): + # check new vectorizability conditions + from loopy.kernel.array import VectorArrayDimTag + from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type + + def create_and_test(insn, exception=None, a=None, b=None): + a = np.zeros((3, 4), dtype=np.int32) if a is None else a + data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)] + kwargs = dict(a=a) + if b is not None: + data += [lp.GlobalArg('b', shape=(12,), dtype=b.dtype)] + kwargs['b'] = b + names = [d.name for d in data] + + knl = lp.make_kernel(['{[i]: 0 <= i < 12}'], + """ + for i + %(insn)s + end + """ % dict(insn=insn), + data + ) + + knl = lp.split_iname(knl, 'i', 4, inner_tag='vec') + knl = lp.split_array_axis(knl, names, 0, 4) + knl = lp.tag_array_axes(knl, names, 'N0,vec') + knl = lp.preprocess_kernel(knl) + lp.generate_code_v2(knl).device_code() + assert knl.instructions[0].within_inames & set(['i_inner']) + assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag) + assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag) + assert filter_iname_tags_by_type(knl.iname_to_tags['i_inner'], VectorizeTag) + + def run(op_list=[], unary_operators=[], func_list=[], unary_funcs=[], + rvals=['1', 'a[i]']): + for op in op_list: + template = 'a[i] = a[i] %(op)s %(rval)s' \ + if op not in unary_operators else 'a[i] = %(op)s a[i]' + for rval in rvals: + create_and_test(template % dict(op=op, rval=rval)) + for func in func_list: + template = 'a[i] = %(func)s(a[i], %(rval)s)' \ + if func not in unary_funcs else 'a[i] = %(func)s(a[i])' + for rval in rvals: + create_and_test(template % dict(func=func, rval=rval)) + + # 1) comparisons + run(['>', '>=', '<', '<=', '==', '!=']) + + # 2) logical operators + run(['and', 'or', 'not'], ['not']) + + # 3) bitwise operators + # bitwise xor '^' not not implemented in codegen + run(['~', '|', '&'], ['~']) + + # 4) functions -- a random selection of the enabled math functions in opencl + run(func_list=['acos', 'exp10', 'atan2', 'round'], + unary_funcs=['round', 'acos', 'exp10']) + + # 5) remainders and floor division (use 4 instead of 1 to avoid pymbolic + # optimizing out the a[i] % 1) + run(['%', '//'], rvals=['a[i]', '4']) + + def test_check_for_variable_access_ordering(): knl = lp.make_kernel( "{[i]: 0<=i