diff --git a/loopy/expression.py b/loopy/expression.py index 3269bc09f064f57857eaa5218c8370383e0f735e..06fe3bb06b1071bb6e8a1a80b861197d9eb0ed5f 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -63,6 +63,30 @@ class VectorizabilityChecker(RecursiveMapper): .. attribute:: vec_iname """ + # this is a simple list of math functions from OpenCL-1.2 + # https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/mathFunctions.html + # this could be expanded / moved to it's own target specific VecCheck if + # necessary + functions = """acos acosh acospi asin + asinh asinpi atan atan2 + atanh atanpi atan2pi cbrt + ceil copysign cos cosh + cospi erfc erf exp + exp2 exp10 expm1 fabs + fdim floor fma fmax + fmin fmod fract frexp + hypot ilogb ldexp lgamma + lgamma_r log log2 log10 + log1p logb mad maxmag + minmag modf nan nextafter + pow pown powr remainder + remquo rint rootn round + rsqrt sin sincos sinh + sinpi sqrt tan tanh + tanpi tgamma trunc""" + + functions = [x.strip() for x in functions.split() if x.strip()] + def __init__(self, kernel, vec_iname, vec_iname_length): self.kernel = kernel self.vec_iname = vec_iname @@ -75,7 +99,7 @@ class VectorizabilityChecker(RecursiveMapper): return reduce(and_, vectorizabilities) def map_sum(self, expr): - return any(self.rec(child) for child in expr.children) + return any([self.rec(child) for child in expr.children]) map_product = map_sum @@ -84,6 +108,16 @@ class VectorizabilityChecker(RecursiveMapper): or self.rec(expr.denominator)) + map_remainder = map_quotient + + def map_floor_div(self, expr): + """ + (a) - ( ((a)<0) ? 
((b)-1) : 0 ) ) / (b) + """ + a, b = expr.numerator, expr.denominator + return self.rec(a) and self.rec(a.lt(0)) and self.rec(b - 1) and \ + self.rec((a - (b - 1)) / b) and self.rec(a / b) + def map_linear_subscript(self, expr): return False @@ -93,10 +127,54 @@ class VectorizabilityChecker(RecursiveMapper): rec_pars = [ self.rec(child) for child in expr.parameters] if any(rec_pars): - raise Unvectorizable("fucntion calls cannot yet be vectorized") + if str(expr.function) not in VectorizabilityChecker.functions: + raise Unvectorizable( + 'Function {} is not known to be vectorizable'.format( + str(expr.function))) + return True return False + @staticmethod + def compile_time_constants(kernel, vec_iname): + """ + Returns a dictionary of (non-vector) inames and temporary variables whose + value is known at "compile" time. These are used (in combination with a + codegen state's variable substitution map) to simplify access expressions + in :func:`get_access_info`. + + Note: inames are mapped to the :class:`Variable` version of themselves, + while temporary variables are mapped to their integer value + + .. parameter:: kernel + The kernel to check + ..
parameter:: vec_iname + the vector iname + + """ + + # determine allowed symbols as non-vector inames + from pymbolic.primitives import Variable + allowed_symbols = dict((sym, Variable(sym)) for sym in kernel.all_inames() + if sym != vec_iname) + from loopy.kernel.instruction import Assignment + from loopy.tools import is_integer + from six import iteritems + + # and compile time integer temporaries + compile_time_assign = dict((str(insn.assignee), insn.expression) + for insn in kernel.instructions if + isinstance(insn, Assignment) and is_integer( + insn.expression)) + allowed_symbols.update( + dict((sym, compile_time_assign[sym]) for sym, var in iteritems( + kernel.temporary_variables) + # temporary variables w/ no initializer, no shape + if var.initializer is None and not var.shape + # compile time integers + and sym in compile_time_assign)) + return allowed_symbols + def map_subscript(self, expr): name = expr.aggregate.name @@ -114,29 +192,45 @@ class VectorizabilityChecker(RecursiveMapper): index = expr.index_tuple - from loopy.symbolic import get_dependencies + from loopy.symbolic import get_dependencies, DependencyMapper from loopy.kernel.array import VectorArrayDimTag - from pymbolic.primitives import Variable possible = None for i in range(len(var.shape)): - if ( - isinstance(var.dim_tags[i], VectorArrayDimTag) - and isinstance(index[i], Variable) - and index[i].name == self.vec_iname): + dep_mapper = DependencyMapper(composite_leaves=False) + deps = dep_mapper(index[i]) + # if we're on the vector index + if isinstance(var.dim_tags[i], VectorArrayDimTag): if var.shape[i] != self.vec_iname_length: raise Unvectorizable("vector length was mismatched") - if possible is None: - possible = True - - else: - if self.vec_iname in get_dependencies(index[i]): - raise Unvectorizable("vectorizing iname '%s' occurs in " - "unvectorized subscript axis %d (1-based) of " - "expression '%s'" - % (self.vec_iname, i+1, expr)) - break + possible = self.vec_iname in [str(x) for 
x in deps] + # or, if not vector index, and vector iname is present + elif self.vec_iname in set(x.name for x in deps): + # check whether we can simplify out the vector iname + context = dict((str(x), x) for x in deps if x.name != self.vec_iname) + allowed_symbols = self.compile_time_constants( + self.kernel, self.vec_iname) + + from pymbolic import substitute + from pymbolic.mapper.evaluator import UnknownVariableError + from loopy.tools import is_integer + for veci in range(self.vec_iname_length): + ncontext = context.copy() + ncontext[self.vec_iname] = veci + try: + idi = substitute(index[i], ncontext) + if not is_integer(idi) and not all( + x in allowed_symbols + for x in get_dependencies(idi)): + raise Unvectorizable( + "vectorizing iname '%s' occurs in " + "unvectorized subscript axis %d (1-based) of " + "expression '%s', and could not be simplified " + "to compile-time constants." + % (self.vec_iname, i+1, expr)) + except UnknownVariableError: + break return bool(possible) @@ -160,16 +254,31 @@ class VectorizabilityChecker(RecursiveMapper): return False def map_comparison(self, expr): - # FIXME: These actually can be vectorized: # https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/relationalFunctions.html + # even better for OpenCL <, <=, >, >=, !=, == are all vectorizable by default + # (see: sec 6.3.d-6.d.3 in OpenCL-1.2 docs) + + if expr.operator in ["<", "<=", ">", ">=", "!=", "=="]: + return any([self.rec(x) for x in [expr.left, expr.right]]) + raise Unvectorizable() def map_logical_not(self, expr): - raise Unvectorizable() + # 6.3.h in OpenCL-1.2 docs + return self.rec(expr.child) + + def map_logical_and(self, expr): + # 6.3.h in OpenCL-1.2 docs + return any([self.rec(x) for x in expr.children]) + + map_logical_or = map_logical_and - map_logical_and = map_logical_not - map_logical_or = map_logical_not + # sec 6.3.f in OpenCL-1.2 docs + map_bitwise_not = map_logical_not + map_bitwise_or = map_logical_and + map_bitwise_xor = map_logical_and +
map_bitwise_and = map_logical_and def map_reduction(self, expr): # FIXME: Do this more carefully diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 3588f38af13479b127208c25735f1046eaa82706..5b7978a382aa144254a7ffbacedbab30f0ddf5eb 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1213,33 +1213,58 @@ class AccessInfo(ImmutableRecord): """ -def get_access_info(target, ary, index, eval_expr, vectorization_info): +def get_access_info(target, ary, index, var_subst_map, vectorization_info): """ :arg ary: an object of type :class:`ArrayBase` :arg index: a tuple of indices representing a subscript into ary + :arg var_subst_map: a context of variable substitutions from the calling codegen + state and potentially other compile-time "constants" (inames and + integer temporaries w/ known values), used in detection of loads / shuffles :arg vectorization_info: an instance of :class:`loopy.codegen.VectorizationInfo`, or *None*. """ import loopy as lp from pymbolic import var + from loopy.codegen import Unvectorizable + from loopy.symbolic import get_dependencies - def eval_expr_assert_integer_constant(i, expr): + def eval_expr_assert_constant(i, expr, kwargs): from pymbolic.mapper.evaluator import UnknownVariableError + # determine error type -- if vectorization_info is None, we're in the + # unvec fallback (and should raise a LoopyError) + # if vectorization_info is 'True', we should raise an Unvectorizable + # on failure + error_type = LoopyError if vectorization_info is None else Unvectorizable + from pymbolic import evaluate try: - result = eval_expr(expr) + result = evaluate(expr, kwargs) except UnknownVariableError as e: - raise LoopyError("When trying to index the array '%s' along axis " + if vectorization_info: + # failed vectorization + raise Unvectorizable( + "When trying to vectorize the array '%s' along axis " "%d (tagged '%s'), the index was not a compile-time " "constant (but it has to be in order for code to be " - "generated). 
You likely want to unroll the iname(s) '%s'." + "generated). You likely want to unroll the iname(s) '%s'" % (ary.name, i, ary.dim_tags[i], str(e))) + else: + raise LoopyError( + "When trying to unroll the array '%s' along axis " + "%d (tagged '%s'), the index was not an unrollable-iname " + "or constant (but it has to be in order for code to be " + "generated). You likely want to unroll/change array index(s)" + " '%s'" % (ary.name, i, ary.dim_tags[i], str(e))) if not is_integer(result): - raise LoopyError("subscript '%s[%s]' has non-constant " + # try to simplify further + from loopy.isl_helpers import simplify_via_aff + result = simplify_via_aff(result) + + if any([x not in var_subst_map for x in get_dependencies(result)]): + raise error_type("subscript '%s[%s]' has non-constant " "index for separate-array axis %d (0-based)" % ( ary.name, index, i)) - return result def apply_offset(sub): @@ -1290,7 +1315,7 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)): if isinstance(dim_tag, SeparateArrayArrayDimTag): - idx = eval_expr_assert_integer_constant(i, idx) + idx = eval_expr_assert_constant(i, idx, var_subst_map) array_name += "_s%d" % idx # }}} @@ -1318,18 +1343,17 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): elif isinstance(dim_tag, VectorArrayDimTag): from pymbolic.primitives import Variable - if (vectorization_info is not None - and isinstance(index[i], Variable) + if (vectorization_info and isinstance(index[i], Variable) and index[i].name == vectorization_info.iname): # We'll do absolutely nothing here, which will result # in the vector being returned. 
pass else: - idx = eval_expr_assert_integer_constant(i, idx) - - assert vector_index is None - vector_index = idx + if vector_index is None: + # if we haven't processed the vector index yet + idx = eval_expr_assert_constant(i, idx, var_subst_map) + vector_index = idx else: raise LoopyError("unsupported array dim implementation tag '%s' " diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 8ef921e447bf10d85ac60460f904d528ac64da19..dfe54ae4c9de313da496beb3764f38764ad7e464 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -182,16 +182,22 @@ class ExpressionToCExpressionMapper(IdentityMapper): ary = self.find_array(expr) - from loopy.kernel.array import get_access_info - from pymbolic import evaluate - from loopy.symbolic import simplify_using_aff index_tuple = tuple( simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple) + from loopy.kernel.array import get_access_info + from loopy.expression import VectorizabilityChecker + var_subst_map = self.codegen_state.var_subst_map.copy() + if self.codegen_state.vectorization_info: + ctc_iname = self.codegen_state.vectorization_info.iname + ctc = VectorizabilityChecker.compile_time_constants( + self.codegen_state.kernel, + ctc_iname) + var_subst_map.update(ctc) + access_info = get_access_info(self.kernel.target, ary, index_tuple, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) + var_subst_map, self.codegen_state.vectorization_info) from loopy.kernel.data import ( ImageArg, ArrayArg, TemporaryVariable, ConstantArg) @@ -411,10 +417,17 @@ class ExpressionToCExpressionMapper(IdentityMapper): ary = self.find_array(arg) from loopy.kernel.array import get_access_info - from pymbolic import evaluate + from loopy.expression import VectorizabilityChecker + var_subst_map = self.codegen_state.var_subst_map.copy() + if self.codegen_state.vectorization_info: + ctc_iname = 
self.codegen_state.vectorization_info.iname + ctc = VectorizabilityChecker.compile_time_constants( + self.codegen_state.kernel, + ctc_iname) + var_subst_map.update(ctc) + access_info = get_access_info(self.kernel.target, ary, arg.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) + var_subst_map, self.codegen_state.vectorization_info) from loopy.kernel.data import ImageArg if isinstance(ary, ImageArg): diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index cccee2301e44b16e2454bda5e98af7db7893c003..e28bd1765d31cb24029600aae186e7d626c42533 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -51,8 +51,8 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): def map_group_hw_index(self, expr, type_context): return var( - "((uniform %s) taskIndex%d)" - % (self._get_index_ctype(), expr.axis)) + "((uniform %s) taskIndex%d)" + % (self._get_index_ctype(), expr.axis)) def map_local_hw_index(self, expr, type_context): if expr.axis == 0: @@ -68,7 +68,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): return Literal(repr(float(expr))) elif type_context == "d": # Keepin' the good ideas flowin' since '66. 
- return Literal(repr(float(expr))+"d") + return Literal(repr(float(expr)) + "d") elif type_context == "i": return expr else: @@ -77,7 +77,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): return expr raise RuntimeError("don't know how to generate code " - "for constant '%s'" % expr) + "for constant '%s'" % expr) def map_variable(self, expr, type_context): tv = self.kernel.temporary_variables.get(expr.name) @@ -94,7 +94,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): else: return super(ExprToISPCExprMapper, self).map_variable( - expr, type_context) + expr, type_context) def map_subscript(self, expr, type_context): from loopy.kernel.data import TemporaryVariable @@ -109,15 +109,22 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): if lsize: lsize, = lsize from loopy.kernel.array import get_access_info - from pymbolic import evaluate - access_info = get_access_info(self.kernel.target, ary, expr.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) + var_subst_map = self.codegen_state.var_subst_map.copy() + if self.codegen_state.vectorization_info: + from loopy.expression import VectorizabilityChecker + ctc = VectorizabilityChecker.compile_time_constants( + self.codegen_state.kernel, + self.codegen_state.vectorization_info.iname) + var_subst_map.update(ctc) + + access_info = get_access_info( + self.kernel.target, ary, expr.index, + var_subst_map, self.codegen_state.vectorization_info) subscript, = access_info.subscripts result = var(access_info.array_name)[ - var("programIndex") + self.rec(lsize*subscript, 'i')] + var("programIndex") + self.rec(lsize * subscript, 'i')] if access_info.vector_index is not None: return self.kernel.target.add_vector_access( @@ -126,7 +133,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): return result return super(ExprToISPCExprMapper, self).map_subscript( - expr, type_context) + expr, type_context) # }}} @@ -139,11 +146,13 @@ 
def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): reg.get_or_register_dtype(["int8", "signed char", "char"], np.int8) reg.get_or_register_dtype(["uint8", "unsigned char"], np.uint8) reg.get_or_register_dtype(["int16", "short", "signed short", - "signed short int", "short signed int"], np.int16) + "signed short int", "short signed int"], np.int16) reg.get_or_register_dtype(["uint16", "unsigned short", - "unsigned short int", "short unsigned int"], np.uint16) + "unsigned short int", "short unsigned int"], + np.uint16) reg.get_or_register_dtype(["int32", "int", "signed int"], np.int32) - reg.get_or_register_dtype(["uint32", "unsigned", "unsigned int"], np.uint32) + reg.get_or_register_dtype( + ["uint32", "unsigned", "unsigned int"], np.uint32) reg.get_or_register_dtype(["int64"], np.int64) reg.get_or_register_dtype(["uint64"], np.uint64) @@ -177,9 +186,10 @@ class ISPCTarget(CTarget): if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: - raise LoopyError("local axis %d (0-based) " - "has length > 1, which is unsupported " - "by ISPC" % ls_i) + raise LoopyError( + "local axis %d (0-based) " + "has length > 1, which is unsupported " + "by ISPC" % (i + 1)) def get_host_ast_builder(self): return ISPCASTBuilder(self) @@ -194,7 +204,7 @@ class ISPCTarget(CTarget): from loopy.target.c.compyte.dtypes import DTypeRegistry result = DTypeRegistry() fill_registry_with_ispc_types(result, respect_windows=False, - include_bool=True) + include_bool=True) return result # }}} @@ -206,8 +216,8 @@ class ISPCASTBuilder(CASTBuilder): arg_names = [iai.name for iai in implemented_data_info] arg_decls = [ - self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in implemented_data_info] + self.idi_to_cgen_declarator(codegen_state.kernel, idi) + for idi in implemented_data_info] # {{{ occa compatibility hackery @@ -217,11 +227,10 @@ class ISPCASTBuilder(CASTBuilder): from cgen.ispc import ISPCUniform arg_decls = [ -
Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))), - Const(ISPCUniform(Value("int", "o1"))), - Const(ISPCUniform(Value("int", "o2"))), - Const(ISPCUniform(Value("int", "o3"))), - ] + arg_decls + Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))), + Const(ISPCUniform(Value("int", "o1"))), + Const(ISPCUniform(Value("int", "o2"))), + Const(ISPCUniform(Value("int", "o3")))] + arg_decls arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names # }}} @@ -231,7 +240,7 @@ class ISPCASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + schedule_index): name = codegen_result.current_program(codegen_state).name from cgen import (FunctionDeclaration, Value) @@ -241,14 +250,14 @@ class ISPCASTBuilder(CASTBuilder): if codegen_state.is_generating_device_code: result = ISPCTask( - FunctionDeclaration( - Value("void", name), - arg_decls)) + FunctionDeclaration( + Value("void", name), + arg_decls)) else: result = ISPCExport( - FunctionDeclaration( - Value("void", name), - arg_decls)) + FunctionDeclaration( + Value("void", name), + arg_decls)) from loopy.target.c import FunctionDeclarationWrapper return FunctionDeclarationWrapper(result) @@ -263,20 +272,19 @@ class ISPCASTBuilder(CASTBuilder): from cgen import Statement as S, Block if lsize: result.append( - S( - "assert(programCount == (%s))" - % ecm(lsize[0], PREC_NONE))) + S( + "assert(programCount == (%s))" + % ecm(lsize[0], PREC_NONE))) arg_names, arg_decls = self._arg_names_and_decls(codegen_state) from cgen.ispc import ISPCLaunch result.append( - ISPCLaunch( - tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize), - "%s(%s)" % ( - name, - ", ".join(arg_names) - ))) + ISPCLaunch( + tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize), + "%s(%s)" % ( + name, + ", ".join(arg_names)))) return Block(result) @@ -319,9 +327,9 @@ class ISPCASTBuilder(CASTBuilder): from cgen import ArrayOf ecm = self.get_expression_to_code_mapper(codegen_state) temp_var_decl = 
ArrayOf( - temp_var_decl, - ecm(p.flattened_product(shape), - prec=PREC_NONE, type_context="i")) + temp_var_decl, + ecm(p.flattened_product(shape), + prec=PREC_NONE, type_context="i")) return temp_var_decl @@ -346,13 +354,13 @@ class ISPCASTBuilder(CASTBuilder): def get_global_arg_decl(self, name, shape, dtype, is_written): from warnings import warn warn("get_global_arg_decl is deprecated use get_array_arg_decl " - "instead.", DeprecationWarning, stacklevel=2) + "instead.", DeprecationWarning, stacklevel=2) return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, dtype, is_written) def get_value_arg_decl(self, name, shape, dtype, is_written): result = super(ISPCASTBuilder, self).get_value_arg_decl( - name, shape, dtype, is_written) + name, shape, dtype, is_written) from cgen import Reference, Const was_const = isinstance(result, Const) @@ -386,8 +394,8 @@ class ISPCASTBuilder(CASTBuilder): rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype) rhs_code = ecm(insn.expression, prec=PREC_NONE, - type_context=rhs_type_context, - needed_dtype=lhs_dtype) + type_context=rhs_type_context, + needed_dtype=lhs_dtype) lhs = insn.assignee @@ -397,21 +405,28 @@ class ISPCASTBuilder(CASTBuilder): ary = ecm.find_array(lhs) from loopy.kernel.array import get_access_info - from pymbolic import evaluate from loopy.symbolic import simplify_using_aff index_tuple = tuple( - simplify_using_aff(kernel, idx) for idx in lhs.index_tuple) + simplify_using_aff(kernel, idx) for idx in lhs.index_tuple) + + var_subst_map = codegen_state.var_subst_map.copy() + if codegen_state.vectorization_info: + from loopy.expression import VectorizabilityChecker + ctc = VectorizabilityChecker.compile_time_constants( + codegen_state.kernel, + codegen_state.vectorization_info.iname) + var_subst_map.update(ctc) - access_info = get_access_info(kernel.target, ary, index_tuple, - lambda expr: evaluate(expr, codegen_state.var_subst_map), - codegen_state.vectorization_info) + access_info = 
get_access_info( + kernel.target, ary, index_tuple, + var_subst_map, codegen_state.vectorization_info) from loopy.kernel.data import ArrayArg, TemporaryVariable if not isinstance(ary, (ArrayArg, TemporaryVariable)): raise LoopyError("array type not supported in ISPC: %s" - % type(ary).__name) + % type(ary).__name__) if len(access_info.subscripts) != 1: raise LoopyError("streaming stores must have a subscript") @@ -430,8 +445,8 @@ class ISPCASTBuilder(CASTBuilder): saw_l0 = False for term in terms: - if (isinstance(term, Variable) - and kernel.iname_tags_of_type(term.name, LocalIndexTag)): + if (isinstance(term, Variable) and kernel.iname_tags_of_type( + term.name, LocalIndexTag)): tag, = kernel.iname_tags_of_type( term.name, LocalIndexTag, min_num=1, max_num=1) if tag.axis == 0: @@ -456,11 +471,11 @@ class ISPCASTBuilder(CASTBuilder): if not saw_l0: raise LoopyError("streaming store must have stride 1 in " - "local index, got: %s" % subscript) + "local index, got: %s" % subscript) if access_info.vector_index is not None: raise LoopyError("streaming store may not use a short-vector " - "data type") + "data type") rhs_has_programindex = any( isinstance(tag, LocalIndexTag) and tag.axis == 0 @@ -472,11 +487,11 @@ class ISPCASTBuilder(CASTBuilder): from cgen import Statement return Statement( - "streaming_store(%s + %s, %s)" - % ( - access_info.array_name, - ecm(flattened_sum(new_terms), PREC_NONE, 'i'), - rhs_code)) + "streaming_store(%s + %s, %s)" + % ( + access_info.array_name, + ecm(flattened_sum(new_terms), PREC_NONE, 'i'), + rhs_code)) # }}} @@ -484,7 +499,7 @@ class ISPCASTBuilder(CASTBuilder): return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - lbound, ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper from loopy.target.c import POD @@ -495,14 +510,14 @@ class ISPCASTBuilder(CASTBuilder): from cgen.ispc import ISPCUniform return For( -
InlineInitializer( - ISPCUniform(POD(self, iname_dtype, iname)), - ecm(lbound, PREC_NONE, "i")), - ecm( - p.Comparison(var(iname), "<=", ubound), - PREC_NONE, "i"), - "++%s" % iname, - inner) + InlineInitializer( + ISPCUniform(POD(self, iname_dtype, iname)), + ecm(lbound, PREC_NONE, "i")), + ecm( + p.Comparison(var(iname), "<=", ubound), + PREC_NONE, "i"), + "++%s" % iname, + inner) # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 80af89f3bb09f2d2bf394279115acfa0fe928e2b..47f86e196463127600fd6ee63cd4861cded6abe1 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2800,6 +2800,114 @@ def test_add_prefetch_works_in_lhs_index(): assert "a1_map" not in get_dependencies(insn.assignees) +def test_vectorizability(): + # check new vectorizability conditions + from loopy.kernel.array import VectorArrayDimTag + from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type + + def create_and_test(insn, exception=None, a=None, b=None): + a = np.zeros((3, 4), dtype=np.int32) if a is None else a + data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)] + kwargs = dict(a=a) + if b is not None: + data += [lp.GlobalArg('b', shape=(12,), dtype=b.dtype)] + kwargs['b'] = b + names = [d.name for d in data] + + knl = lp.make_kernel(['{[i]: 0 <= i < 12}'], + """ + for i + %(insn)s + end + """ % dict(insn=insn), + data + ) + + knl = lp.split_iname(knl, 'i', 4, inner_tag='vec') + knl = lp.split_array_axis(knl, names, 0, 4) + knl = lp.tag_array_axes(knl, names, 'N0,vec') + knl = lp.preprocess_kernel(knl) + lp.generate_code_v2(knl).device_code() + assert knl.instructions[0].within_inames & set(['i_inner']) + assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag) + assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag) + assert filter_iname_tags_by_type(knl.iname_to_tags['i_inner'], VectorizeTag) + + def run(op_list=[], unary_operators=[], func_list=[], unary_funcs=[], + rvals=['1', 'a[i]']): + for op in op_list: + template = 'a[i] = a[i] %(op)s 
%(rval)s' \ + if op not in unary_operators else 'a[i] = %(op)s a[i]' + for rval in rvals: + create_and_test(template % dict(op=op, rval=rval)) + for func in func_list: + template = 'a[i] = %(func)s(a[i], %(rval)s)' \ + if func not in unary_funcs else 'a[i] = %(func)s(a[i])' + for rval in rvals: + create_and_test(template % dict(func=func, rval=rval)) + + # 1) comparisons + run(['>', '>=', '<', '<=', '==', '!=']) + + # 2) logical operators + run(['and', 'or', 'not'], ['not']) + + # 3) bitwise operators + # bitwise xor '^' not not implemented in codegen + run(['~', '|', '&'], ['~']) + + # 4) functions -- a random selection of the enabled math functions in opencl + run(func_list=['acos', 'exp10', 'atan2', 'round'], + unary_funcs=['round', 'acos', 'exp10']) + + # 5) remainders and floor division (use 4 instead of 1 to avoid pymbolic + # optimizing out the a[i] % 1) + run(['%', '//'], rvals=['a[i]', '4']) + + # 6) check vectorizability of subscripts w/ compile-time constants directly + def _get_offset_kernel(as_temporary=True): + data = [lp.GlobalArg('a', shape=(12, 16), dtype=np.int32)] + if as_temporary: + pre = '<> c = 4' + else: + pre = '' + data.append(lp.ValueArg('c', dtype=np.int32)) + + # make a kernel + knl = lp.make_kernel(['{[i,j]: 0 <= i,j < 12}'], + """ + {pre} + a[j, i + c] = 1 + """.format(pre=pre), data) + + knl = lp.split_iname(knl, 'i', 4, inner_tag='vec') + knl = lp.split_array_axis(knl, 'a', 1, 4) + knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec') + knl = lp.preprocess_kernel(knl) + return knl + + # get checker + from loopy.expression import VectorizabilityChecker + from loopy.diagnostic import LoopyError + # test CTC's + knl = _get_offset_kernel() + assert (set(VectorizabilityChecker.compile_time_constants(knl, 'i_inner').keys()) + == set(['j', 'c', 'i_outer'])) + # test that the VC doesn't throw an Unvectorizable + VectorizabilityChecker(knl, 'i', 4)(knl.instructions[0].assignee) + + # and finally test that we can generate code + with 
pytest.raises(LoopyError): + # This test is broken in this MR as the shuffle / load logic in + # `get_access_info` is in a forthcoming MR + print(lp.generate_code_v2(knl).device_code()) + + # fix the parameter and allow vectorization + knl = _get_offset_kernel(False) + knl = lp.fix_parameters(knl, c=4) + print(lp.generate_code_v2(knl).device_code()) + + def test_check_for_variable_access_ordering(): knl = lp.make_kernel( "{[i]: 0<=i