diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 26f225c6293e62ef43adc921bfef20a3a2a534b1..37c8a12ee125fb26c54e84918a538c8d36e0cb4a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -24,7 +24,7 @@ THE SOFTWARE. import six -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, warn from pytools import Record import islpy as isl @@ -167,8 +167,26 @@ class SeenFunction(Record): # {{{ code generation state +class Unvectorizable(Exception): + pass + + +class VectorizationInfo(object): + """ + .. attribute:: iname + .. attribute:: length + .. attribute:: space + """ + + def __init__(self, iname, length, space): + self.iname = iname + self.length = length + self.space = space + + class CodeGenerationState(object): """ + .. attribute:: kernel .. attribute:: implemented_domain The entire implemented domain (as an :class:`islpy.Set`) @@ -179,26 +197,79 @@ class CodeGenerationState(object): A :class:`frozenset` of predicates for which checks have been implemented. - .. attribute:: expression_to_code_mapper + .. attribute:: seen_dtypes + + set of dtypes that were encountered + + .. attribute:: seen_functions - A :class:`loopy.codegen.expression.CCodeMapper` that does not take - per-ILP assignments into account. + set of :class:`SeenFunction` instances + + .. attribute:: var_subst_map + + .. attribute:: allow_complex + + .. attribute:: vectorization_info + + None or an instance of :class:`VectorizationInfo` """ - def __init__(self, implemented_domain, implemented_predicates, - expression_to_code_mapper): + + def __init__(self, kernel, implemented_domain, implemented_predicates, + seen_dtypes, seen_functions, var_subst_map, + allow_complex, + vectorization_info=None): + self.kernel = kernel self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates - self.expression_to_code_mapper = expression_to_code_mapper + self.seen_dtypes = seen_dtypes + self.seen_functions = seen_functions + self.var_subst_map = var_subst_map.copy() + self.allow_complex = allow_complex + self.vectorization_info = vectorization_info + + # {{{ copy helpers def copy(self, implemented_domain=None, implemented_predicates=frozenset(), - expression_to_code_mapper=None): + var_subst_map=None, vectorization_info=None): + + if vectorization_info is False: + vectorization_info = None + + elif vectorization_info is None: + vectorization_info = self.vectorization_info + return CodeGenerationState( + kernel=self.kernel, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( implemented_predicates or self.implemented_predicates), - expression_to_code_mapper=( - expression_to_code_mapper - or self.expression_to_code_mapper)) + seen_dtypes=self.seen_dtypes, + seen_functions=self.seen_functions, + var_subst_map=var_subst_map or self.var_subst_map, + allow_complex=self.allow_complex, + vectorization_info=vectorization_info) + + def copy_and_assign(self, name, value): + """Make a copy of self with variable *name* fixed to *value*.""" + var_subst_map = self.var_subst_map.copy() + var_subst_map[name] = value + return self.copy(var_subst_map=var_subst_map) + + def copy_and_assign_many(self, assignments): + """Make a copy of self with *assignments* included.""" + + var_subst_map = self.var_subst_map.copy() + var_subst_map.update(assignments) + return self.copy(var_subst_map=var_subst_map) + + # }}} + + @property + def expression_to_code_mapper(self): + # It's kind of unfortunate that this is here, but it's 
an accident + # of history for now. + + return self.kernel.target.get_expression_to_code_mapper(self) def intersect(self, other): new_impl, new_other = isl.align_two(self.implemented_domain, other) @@ -225,11 +296,43 @@ class CodeGenerationState(object): expr = pw_aff_to_expr(aff) new_impl_domain = new_impl_domain.add_constraint(cns) - return self.copy( - implemented_domain=new_impl_domain, - expression_to_code_mapper=( - self.expression_to_code_mapper.copy_and_assign(iname, expr))) + return self.copy_and_assign(iname, expr).copy( + implemented_domain=new_impl_domain) + + def try_vectorized(self, what, func): + """If *self* is in a vectorizing state (:attr:`vectorization_info` is + not None), tries to call func (which must be a callable accepting a + single :class:`CodeGenerationState` argument). If this fails with + :exc:`Unvectorizable`, it unrolls the vectorized loop instead. + + *func* should return a :class:`GeneratedCode` instance. + + :returns: :class:`GeneratedCode` + """ + + if self.vectorization_info is None: + return func(self) + + try: + return func(self) + except Unvectorizable as e: + warn(self.kernel, "vectorize_failed", + "Vectorization of '%s' failed because '%s'" + % (what, e)) + + return self.unvectorize(func) + + def unvectorize(self, func): + vinf = self.vectorization_info + result = [] + novec_self = self.copy(vectorization_info=False) + + for i in range(vinf.length): + idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i + new_codegen_state = novec_self.fix(vinf.iname, idx_aff) + result.append(func(new_codegen_state)) + return gen_code_block(result) # }}} @@ -431,10 +534,13 @@ def generate_code(kernel, device=None): initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( + kernel=kernel, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), - expression_to_code_mapper=kernel.target.get_expression_to_code_mapper( - kernel, seen_dtypes, seen_functions, allow_complex)) + seen_dtypes=seen_dtypes, + seen_functions=seen_functions, + var_subst_map={}, + allow_complex=allow_complex) code_str, implemented_domains = kernel.target.generate_code( kernel, codegen_state, impl_arg_info) @@ -517,10 +623,13 @@ def generate_body(kernel): initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( + kernel=kernel, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), - expression_to_code_mapper=kernel.target.get_expression_to_code_mapper( - kernel, seen_dtypes, seen_functions, allow_complex)) + seen_dtypes=seen_dtypes, + seen_functions=seen_functions, + var_subst_map={}, + allow_complex=allow_complex) code_str, implemented_domains = kernel.target.generate_body( kernel, codegen_state) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index b28244a8c01e72611244f33901793c37804b00ad..a613e4882ecb916d9088173851b86e5461325c57 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -61,12 +61,15 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): from loopy.codegen.loop import ( generate_unroll_loop, + generate_vectorize_loop, generate_sequential_loop_dim_code) from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag, - LoopedIlpTag) + LoopedIlpTag, VectorizeTag) if isinstance(tag, (UnrollTag, UnrolledIlpTag)): func = generate_unroll_loop + elif isinstance(tag, VectorizeTag): + func = generate_vectorize_loop elif tag is None or isinstance(tag, 
(LoopedIlpTag, ForceSequentialTag)): func = generate_sequential_loop_dim_code else: @@ -92,7 +95,9 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): insn = kernel.id_to_insn[sched_item.insn_id] from loopy.codegen.instruction import generate_instruction_code - return generate_instruction_code(kernel, insn, codegen_state) + return codegen_state.try_vectorized( + "instruction %s" % insn.id, + lambda inner_cgs: generate_instruction_code(kernel, insn, inner_cgs)) else: raise RuntimeError("unexpected schedule item type: %s" @@ -346,27 +351,67 @@ def build_loop_nest(kernel, sched_index, codegen_state): else: if group_length == 1: # group only contains starting schedule item - result = [generate_code_for_sched_index( - kernel, sched_index, new_codegen_state)] - if result == [None]: - result = [] + def gen_code(inner_codegen_state): + inner = generate_code_for_sched_index( + kernel, sched_index, inner_codegen_state) + + if inner is None: + return [] + else: + return [inner] + else: # recurse with a bigger done_group_lengths - result = build_insn_group( - sched_index_info_entries[0:group_length], - new_codegen_state, - done_group_lengths=done_group_lengths | set([group_length])) + def gen_code(inner_codegen_state): + return build_insn_group( + sched_index_info_entries[0:group_length], + inner_codegen_state, + done_group_lengths=( + done_group_lengths | set([group_length]))) + + # gen_code returns a list if bounds_checks or pred_checks: from loopy.codegen import wrap_in_if from loopy.codegen.bounds import constraint_to_code - conditionals = [ - constraint_to_code( - codegen_state.expression_to_code_mapper, cns) - for cns in bounds_checks] + list(pred_checks) + prev_gen_code = gen_code + + def gen_code(inner_codegen_state): + conditionals = [ + constraint_to_code( + inner_codegen_state.expression_to_code_mapper, cns) + for cns in bounds_checks] + list(pred_checks) + + prev_result = prev_gen_code(inner_codegen_state) + + return [wrap_in_if( + conditionals, + gen_code_block(prev_result))] - result = [wrap_in_if(conditionals, gen_code_block(result))] + cannot_vectorize = False + if new_codegen_state.vectorization_info is not None: + from loopy.isl_helpers import obj_involves_variable + for cond in bounds_checks: + if obj_involves_variable( + cond, + new_codegen_state.vectorization_info.iname): + cannot_vectorize = True + break + + if cannot_vectorize: + def gen_code_wrapper(inner_codegen_state): + # gen_code returns a list, but this needs to return a + # GeneratedCode instance. + + return gen_code_block(gen_code(inner_codegen_state)) + + result = [new_codegen_state.unvectorize(gen_code_wrapper)] + else: + result = gen_code(new_codegen_state) + + else: + result = gen_code(new_codegen_state) return result + build_insn_group( sched_index_info_entries[group_length:], codegen_state) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index b98716a4e6281ff559abc670a8e1b0fe5d67976d..75d71e4cb6b0f0240c3bf568c19eb890c5823484 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -27,7 +27,7 @@ THE SOFTWARE. 
 from six.moves import range
 import islpy as isl

-from loopy.codegen import GeneratedInstruction
+from loopy.codegen import GeneratedInstruction, Unvectorizable
 from pymbolic.mapper.stringifier import PREC_NONE


@@ -90,13 +90,30 @@ def generate_instruction_code(kernel, insn, codegen_state):

 def generate_expr_instruction_code(kernel, insn, codegen_state):
     ecm = codegen_state.expression_to_code_mapper

+    from loopy.expression import dtype_to_type_context, VectorizabilityChecker
+
+    if codegen_state.vectorization_info:
+        vinfo = codegen_state.vectorization_info
+        vcheck = VectorizabilityChecker(
+                kernel, vinfo.iname, vinfo.length)
+        lhs_is_vector = vcheck(insn.assignee)
+        rhs_is_vector = vcheck(insn.expression)
+
+        if lhs_is_vector != rhs_is_vector:
+            raise Unvectorizable(
+                    "LHS and RHS disagree on whether they are vectors")
+
+        is_vector = lhs_is_vector
+
+        del lhs_is_vector
+        del rhs_is_vector
+
     expr = insn.expression

     (assignee_var_name, assignee_indices), = insn.assignees_and_indices()
     target_dtype = kernel.get_var_descriptor(assignee_var_name).dtype

     from cgen import Assign
-    from loopy.expression import dtype_to_type_context
     lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
     result = Assign(
             lhs_code,
@@ -105,6 +122,9 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
             needed_dtype=target_dtype))

     if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
+        if codegen_state.vectorization_info and is_vector:
+            raise Unvectorizable("tracing does not support vectorization")
+
         from cgen import Statement as S  # noqa

         gs, ls = kernel.get_grid_sizes()
@@ -160,6 +180,9 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):


 def generate_c_instruction_code(kernel, insn, codegen_state):
+    if codegen_state.vectorization_info is not None:
+        raise Unvectorizable("C instructions cannot be vectorized")
+
     ecm = codegen_state.expression_to_code_mapper

     body = []
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 33cdc2118e389ae7b04e43e28a4c6d0e62011751..936366030a239822d77c0265f698edcbd56ef695
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
""" +from loopy.diagnostic import warn, LoopyError from loopy.codegen import gen_code_block import islpy as isl from islpy import dim_type @@ -125,8 +126,14 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): static_max_of_pw_aff, static_value_of_pw_aff) from loopy.symbolic import pw_aff_to_expr - length = int(pw_aff_to_expr( - static_max_of_pw_aff(bounds.size, constants_only=True))) + length_aff = static_max_of_pw_aff(bounds.size, constants_only=True) + + if not length_aff.is_cst(): + raise LoopyError( + "length of unrolled loop '%s' is not a constant, " + "cannot unroll") + + length = int(pw_aff_to_expr(length_aff)) try: lower_bound_aff = static_value_of_pw_aff( @@ -148,6 +155,63 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): # }}} +# {{{ vectorized loops + +def generate_vectorize_loop(kernel, sched_index, codegen_state): + iname = kernel.schedule[sched_index].iname + + bounds = kernel.get_iname_bounds(iname, constants_only=True) + + from loopy.isl_helpers import ( + static_max_of_pw_aff, static_value_of_pw_aff) + from loopy.symbolic import pw_aff_to_expr + + length_aff = static_max_of_pw_aff(bounds.size, constants_only=True) + + if not length_aff.is_cst(): + warn(kernel, "vec_upper_not_const", + "upper bound for vectorized loop '%s' is not a constant, " + "cannot vectorize--unrolling instead") + return generate_unroll_loop(kernel, sched_index, codegen_state) + + length = int(pw_aff_to_expr(length_aff)) + + try: + lower_bound_aff = static_value_of_pw_aff( + bounds.lower_bound_pw_aff.coalesce(), + constants_only=False) + except Exception as e: + raise type(e)("while finding lower bound of '%s': " % iname) + + if not lower_bound_aff.plain_is_zero(): + warn(kernel, "vec_lower_not_0", + "lower bound for vectorized loop '%s' is not zero, " + "cannot vectorize--unrolling instead") + return generate_unroll_loop(kernel, sched_index, codegen_state) + + # {{{ 'implement' vectorization bounds + + domain = kernel.get_inames_domain(iname) + + from loopy.isl_helpers import make_slab + slab = make_slab(domain.get_space(), iname, + lower_bound_aff, lower_bound_aff+length) + codegen_state = codegen_state.intersect(slab) + + # }}} + + from loopy.codegen import VectorizationInfo + new_codegen_state = codegen_state.copy( + vectorization_info=VectorizationInfo( + iname=iname, + length=length, + space=length_aff.space)) + + return build_loop_nest(kernel, sched_index+1, new_codegen_state) + +# }}} + + def intersect_kernel_with_slab(kernel, slab, iname): from loopy.kernel.tools import DomainChanger @@ -249,10 +313,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, # Have the conditional infrastructure generate the # slabbing conditionals. slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname) - new_codegen_state = codegen_state.copy( - expression_to_code_mapper=( - codegen_state.expression_to_code_mapper.copy_and_assign( - iname, hw_axis_expr))) + new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr) inner = set_up_hw_parallel_loops( slabbed_kernel, sched_index, diff --git a/loopy/expression.py b/loopy/expression.py index d912d528a7cc6a567db5aca684103eb736737030..7cf719f8d806d0f7d2de9471d41a559643ea2f11 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -25,10 +25,13 @@ THE SOFTWARE. 
 import numpy as np

-from pymbolic.mapper import CombineMapper
+from pymbolic.mapper import CombineMapper, RecursiveMapper

 from loopy.tools import is_integer
-from loopy.diagnostic import TypeInferenceFailure, DependencyTypeInferenceFailure
+from loopy.codegen import Unvectorizable
+from loopy.diagnostic import (
+        LoopyError,
+        TypeInferenceFailure, DependencyTypeInferenceFailure)


 # type_context may be:
@@ -257,4 +260,129 @@ class TypeInferenceMapper(CombineMapper):

 # }}}

+
+# {{{ vectorizability checker
+
+class VectorizabilityChecker(RecursiveMapper):
+    """The return value from this mapper is a :class:`bool` indicating whether
+    the result of the expression is vectorized along :attr:`vec_iname`.
+    If the expression is not vectorizable, the mapper raises :class:`Unvectorizable`.
+
+    .. attribute:: vec_iname
+    """
+
+    def __init__(self, kernel, vec_iname, vec_iname_length):
+        self.kernel = kernel
+        self.vec_iname = vec_iname
+        self.vec_iname_length = vec_iname_length
+
+    @staticmethod
+    def combine(vectorizabilities):
+        from functools import reduce
+        from operator import and_
+        return reduce(and_, vectorizabilities)
+
+    def map_sum(self, expr):
+        return any(self.rec(child) for child in expr.children)
+
+    map_product = map_sum
+
+    def map_quotient(self, expr):
+        return (self.rec(expr.numerator)
+                or
+                self.rec(expr.denominator))
+
+    def map_linear_subscript(self, expr):
+        return False
+
+    def map_call(self, expr):
+        # FIXME: Should implement better vectorization check for function calls
+
+        rec_pars = [
+                self.rec(child) for child in expr.parameters]
+        if any(rec_pars):
+            raise Unvectorizable("function calls cannot yet be vectorized")
+
+        return False
+
+    def map_subscript(self, expr):
+        name = expr.aggregate.name
+
+        var = self.kernel.arg_dict.get(name)
+        if var is None:
+            var = self.kernel.temporary_variables.get(name)
+
+        if var is None:
+            raise LoopyError("unknown array variable in subscript: %s"
+                    % name)
+
+        from loopy.kernel.array import ArrayBase
+        if not isinstance(var, ArrayBase):
+            raise LoopyError("non-array subscript '%s'" % expr)
+
+        index = expr.index
+        if not isinstance(index, tuple):
+            index = (index,)
+
+        from loopy.symbolic import get_dependencies
+        from loopy.kernel.array import VectorArrayDimTag
+        from pymbolic.primitives import Variable
+
+        possible = None
+        for i in range(len(var.shape)):
+            if (
+                    isinstance(var.dim_tags[i], VectorArrayDimTag)
+                    and isinstance(index[i], Variable)
+                    and index[i].name == self.vec_iname):
+                if var.shape[i] != self.vec_iname_length:
+                    raise Unvectorizable("vector length was mismatched")
+
+                if possible is None:
+                    possible = True
+
+            else:
+                if self.vec_iname in get_dependencies(index[i]):
+                    raise Unvectorizable("other vectorization iname "
+                            "dependencies in subscript")
+                    break
+
+        return bool(possible)
+
+    def map_constant(self, expr):
+        return False
+
+    def map_variable(self, expr):
+        if expr.name == self.vec_iname:
+            # Technically, this is doable. But we're not going there.
+            raise Unvectorizable()
+
+        # A single variable is always a scalar.
+        return False
+
+    map_tagged_variable = map_variable
+
+    def map_lookup(self, expr):
+        if self.rec(expr.aggregate):
+            raise Unvectorizable()
+
+        return False
+
+    def map_comparison(self, expr):
+        # FIXME: These actually can be vectorized:
+        # https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/relationalFunctions.html
+
+        raise Unvectorizable()
+
+    def map_logical_not(self, expr):
+        raise Unvectorizable()
+
+    map_logical_and = map_logical_not
+    map_logical_or = map_logical_not
+
+    def map_reduction(self, expr):
+        # FIXME: Do this more carefully
+        raise Unvectorizable()
+
+# }}}
+
 # vim: fdm=marker
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 2533c0f2cb16e755506124acf470ba187eb6dce4..e8409a9ce8a8bd20f2e2df81c20ce56b167ab090
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -371,4 +371,17 @@ def project_out(set, inames):
     return set

+
+def obj_involves_variable(obj, var_name):
+    loc = obj.get_var_dict().get(var_name)
+    if loc is not None:
+        if not obj.get_coefficient_val(*loc).is_zero():
+            return True
+
+    for idiv in range(obj.dim(dim_type.div)):
+        if obj_involves_variable(obj.get_div(idiv), var_name):
+            return True
+
+    return False
+
 # vim: foldmethod=marker
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 3f5340d645495de42e8e384dc07c34d138be87f4..2923c945c08c6a5fc2d19831efcdf235ea949a72
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -876,7 +876,7 @@ class ArrayBase(Record):

     def decl_info(self, target, is_written, index_dtype):
         """Return a list of :class:`loopy.codegen.ImplementedDataInfo`
-        instances corresponding to the argume
+        instances corresponding to the array.
         """

         from loopy.codegen import ImplementedDataInfo
@@ -1094,16 +1094,18 @@ class AccessInfo(Record):
     """


-def get_access_info(target, ary, index, eval_expr):
+def get_access_info(target, ary, index, eval_expr, vectorization_info):
     """
     :arg ary: an object of type :class:`ArrayBase`
     :arg index: a tuple of indices representing a subscript into ary
+    :arg vectorization_info: an instance of :class:`loopy.codegen.VectorizationInfo`,
+        or *None*.
     """

-    def eval_expr_wrapper(i, expr):
+    def eval_expr_assert_integer_constant(i, expr):
         from pymbolic.mapper.evaluator import UnknownVariableError
         try:
-            return eval_expr(expr)
+            result = eval_expr(expr)
         except UnknownVariableError as e:
             raise LoopyError("When trying to index the array '%s' along axis "
                     "%d (tagged '%s'), the index was not a compile-time "
                     "constant (but it has to be in order for code to be "
                     "generated). You likely want to unroll the iname(s) '%s'."
% (ary.name, i, ary.dim_tags[i], str(e))) + if not is_integer(result): + raise LoopyError("subscript '%s[%s]' has non-constant " + "index for separate-array axis %d (0-based)" % ( + ary.name, index, i)) + + return result + if not isinstance(index, tuple): index = (index,) @@ -1142,11 +1151,7 @@ def get_access_info(target, ary, index, eval_expr): for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)): if isinstance(dim_tag, SeparateArrayArrayDimTag): - idx = eval_expr_wrapper(i, idx) - if not is_integer(idx): - raise LoopyError("subscript '%s[%s]' has non-constant " - "index for separate-array axis %d (0-based)" % ( - ary.name, index, i)) + idx = eval_expr_assert_integer_constant(i, idx) array_name += "_s%d" % idx # }}} @@ -1176,14 +1181,19 @@ def get_access_info(target, ary, index, eval_expr): pass elif isinstance(dim_tag, VectorArrayDimTag): - idx = eval_expr_wrapper(i, idx) - - if not is_integer(idx): - raise LoopyError("subscript '%s[%s]' has non-constant " - "index for separate-array axis %d (0-based)" % ( - ary.name, index, i)) - assert vector_index is None - vector_index = idx + from pymbolic.primitives import Variable + if (vectorization_info is not None + and isinstance(index[i], Variable) + and index[i].name == vectorization_info.iname): + # We'll do absolutely nothing here, which will result + # in the vector being returned. + pass + + else: + idx = eval_expr_assert_integer_constant(i, idx) + + assert vector_index is None + vector_index = idx else: raise LoopyError("unsupported array dim implementation tag '%s' " diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 32c062689b47acd688933a1f615af3cd6f67135a..adffe418e9e533cde8b5d274b6a82ab509fa60a2 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -53,6 +53,15 @@ class IndexTag(Record): return key_builder.rec(key_hash, self.key) + @property + def key(self): + """Return a hashable, comparable value that is used to ensure + per-instruction uniqueness of this unique iname tag. + + Also used for persistent hash construction. 
+ """ + return type(self).__name__ + class ParallelTag(IndexTag): pass @@ -63,9 +72,7 @@ class HardwareParallelTag(ParallelTag): class UniqueTag(IndexTag): - @property - def key(self): - return type(self).__name__ + pass class AxisTag(UniqueTag): @@ -107,10 +114,10 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase): return "l.auto" +# {{{ ilp-like + class IlpBaseTag(ParallelTag): - @property - def key(self): - return type(self).__name__ + pass class UnrolledIlpTag(IlpBaseTag): @@ -122,24 +129,23 @@ class LoopedIlpTag(IlpBaseTag): def __str__(self): return "ilp.seq" +# }}} + + +class VectorizeTag(UniqueTag): + def __str__(self): + return "vec" + class UnrollTag(IndexTag): def __str__(self): return "unr" - @property - def key(self): - return type(self).__name__ - class ForceSequentialTag(IndexTag): def __str__(self): return "forceseq" - @property - def key(self): - return type(self).__name__ - def parse_tag(tag): if tag is None: @@ -155,6 +161,8 @@ def parse_tag(tag): return None elif tag in ["unr"]: return UnrollTag() + elif tag in ["vec"]: + return VectorizeTag() elif tag in ["ilp", "ilp.unr"]: return UnrolledIlpTag() elif tag == "ilp.seq": @@ -347,10 +355,10 @@ class TemporaryVariable(ArrayBase): from loopy.codegen import POD # uses the correct complex type from cgen.opencl import CLLocal - temp_var_decl = POD(target, self.dtype, self.name) + temp_var_decl = POD(target, dtype, self.name) # FIXME take into account storage_shape, or something like it - storage_shape = self.shape + storage_shape = shape if storage_shape: temp_var_decl = ArrayOf(temp_var_decl, diff --git a/loopy/padding.py b/loopy/padding.py index 4951c68543bc143f25a2c376b183d0f76e5d6c18..9aed22598517cb423d978e43f41e98dc8ab43cfd 100644 --- a/loopy/padding.py +++ b/loopy/padding.py @@ -41,7 +41,8 @@ class ArgAxisSplitHelper(RuleAwareIdentityMapper): return super(ArgAxisSplitHelper, self).map_subscript(expr, expn_state) -def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True): +def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True, + split_kwargs=None): """ :arg args_and_axes: a list of tuples *(arg, axis_nr)* indicating that the index in *axis_nr* should be split. The tuples may @@ -54,6 +55,7 @@ def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True): :arg count: The group size to use in the split. :arg auto_split_inames: Whether to automatically split inames encountered in the specified indices. + :arg split_kwargs: arguments to pass to :func:`loopy.split_inames` Note that splits on the corresponding inames are carried out implicitly. The inames may *not* be split beforehand. (There's no *really* good reason @@ -63,6 +65,9 @@ def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True): if count == 1: return kernel + if split_kwargs is None: + split_kwargs = {} + # {{{ process input into arg_to_rest # where "rest" is the non-argument-name part of the input tuples @@ -177,7 +182,7 @@ def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True): "beforehand? 
If so, you shouldn't.)" % (expr, axis_nr)) - split_iname = expr.index[axis_nr].name + split_iname = idx[axis_nr].name assert split_iname in kernel.all_inames() try: @@ -213,10 +218,12 @@ def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True): kernel = kernel.copy(args=new_args) - from loopy import split_iname - for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): - kernel = split_iname(kernel, iname, count, - outer_iname=outer_iname, inner_iname=inner_iname) + if auto_split_inames: + from loopy import split_iname + for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): + kernel = split_iname(kernel, iname, count, + outer_iname=outer_iname, inner_iname=inner_iname, + **split_kwargs) return kernel diff --git a/loopy/precompute.py b/loopy/precompute.py index 11b1396f15b4f9dc440ee75480a1d25fbc1e091a..ae973f98c1de87e2821575f3c65c03c989f696fc 100644 --- a/loopy/precompute.py +++ b/loopy/precompute.py @@ -1,4 +1,4 @@ -from __future__ import division, absolute_import +from __future__ import division, absolute_import, print_function import six from six.moves import range, zip @@ -25,6 +25,7 @@ THE SOFTWARE. """ +import islpy as isl from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) @@ -450,21 +451,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, if precompute_inames is not None: preexisting_precompute_inames = ( set(precompute_inames) & kernel.all_inames()) - - if ( - preexisting_precompute_inames - and - len(preexisting_precompute_inames) < len(precompute_inames)): - raise LoopyError("some (but not all) of the inames in " - "precompute_inames already exist. existing: %s non-existing: %s" - % ( - preexisting_precompute_inames, - set(precompute_inames) - preexisting_precompute_inames)) - - precompute_inames_already_exist = bool(preexisting_precompute_inames) - else: - precompute_inames_already_exist = False + preexisting_precompute_inames = set() # }}} @@ -483,20 +471,22 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, old_name = saxis name = "%s_%s" % (c_subst_name, old_name) - if precompute_inames is not None and i < len(precompute_inames): + if (precompute_inames is not None + and i < len(precompute_inames) + and precompute_inames[i]): name = precompute_inames[i] tag_lookup_saxis = name - if (not precompute_inames_already_exist + if (name not in preexisting_precompute_inames and var_name_gen.is_name_conflicting(name)): raise RuntimeError("new storage axis name '%s' " "conflicts with existing name" % name) - - if not precompute_inames_already_exist: + else: name = var_name_gen(name) storage_axis_names.append(name) - new_iname_to_tag[name] = storage_axis_to_tag.get( - tag_lookup_saxis, default_tag) + if name not in preexisting_precompute_inames: + new_iname_to_tag[name] = storage_axis_to_tag.get( + tag_lookup_saxis, default_tag) prior_storage_axis_name_dict[name] = old_name @@ -522,9 +512,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, if storage_axis_names: # {{{ find domain to be changed - change_inames = expanding_inames - if precompute_inames_already_exist: - change_inames = change_inames | preexisting_precompute_inames + change_inames = expanding_inames | preexisting_precompute_inames from loopy.kernel.tools import DomainChanger domch = DomainChanger(kernel, change_inames) @@ -551,40 +539,105 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, else: del new_iname_to_tag[saxis] - if not 
precompute_inames_already_exist:
-            new_kernel_domains = domch.get_domains_with(
-                    abm.augment_domain_with_sweep(
-                        domch.domain, non1_storage_axis_names,
-                        boxify_sweep=fetch_bounding_box))
-        else:
-            check_domain = domch.domain
+                if saxis in preexisting_precompute_inames:
+                    raise LoopyError("precompute axis %d (1-based) was "
+                            "eliminated as "
+                            "having length 1 but also mapped to existing "
+                            "iname '%s'" % (i+1, saxis))
+
+        mod_domain = domch.domain
+
+        # {{{ modify the domain, taking into account preexisting inames
+
+        # inames may already exist in mod_domain, add them primed to start
+        primed_non1_saxis_names = [
+                iname+"'" for iname in non1_storage_axis_names]

-            # {{{ check the domain the preexisting inames' domain
+        mod_domain = abm.augment_domain_with_sweep(
+                domch.domain, primed_non1_saxis_names,
+                boxify_sweep=fetch_bounding_box)

-            # inames already exist in check_domain, add them primed
-            primed_non1_saxis_names = [
-                    iname+"'" for iname in non1_storage_axis_names]
+        check_domain = mod_domain
+
+        for i, saxis in enumerate(non1_storage_axis_names):
+            var_dict = mod_domain.get_var_dict(isl.dim_type.set)
+
+            if saxis in preexisting_precompute_inames:
+                # add equality constraint between existing and new variable
+
+                dt, dim_idx = var_dict[saxis]
+                saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx)
+
+                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
+                new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx)
+
+                mod_domain = mod_domain.add_constraint(
+                        isl.Constraint.equality_from_aff(new_var_aff - saxis_aff))
+
+                # project out the new one
+                mod_domain = mod_domain.project_out(dt, dim_idx, 1)
+
+            else:
+                # remove the prime from the new variable
+                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
+                mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)

-            check_domain = abm.augment_domain_with_sweep(
-                    check_domain, primed_non1_saxis_names,
-                    boxify_sweep=fetch_bounding_box)
+        # {{{ check that we got the desired domain

-            # project out the original copies
-            from loopy.isl_helpers import project_out
-            check_domain = project_out(check_domain, non1_storage_axis_names)
+        check_domain = check_domain.project_out_except(
+                primed_non1_saxis_names, [isl.dim_type.set])

-            for iname in non1_storage_axis_names:
-                var_dict = check_domain.get_var_dict()
-                dt, dim_idx = var_dict[iname+"'"]
-                check_domain = check_domain.set_dim_name(dt, dim_idx, iname)
+        mod_check_domain = mod_domain

-            if not (check_domain <= domch.domain and domch.domain <= check_domain):
-                raise LoopyError("domain of preexisting inames does not match "
-                        "domain needed for precompute")
+        # re-add the prime to the new variables
+        var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)

-            # }}}
+        for saxis in non1_storage_axis_names:
+            dt, dim_idx = var_dict[saxis]
+            mod_check_domain = mod_check_domain.set_dim_name(dt, dim_idx, saxis+"'")
+
+        mod_check_domain = mod_check_domain.project_out_except(
+                primed_non1_saxis_names, [isl.dim_type.set])
+
+        mod_check_domain, check_domain = isl.align_two(
+                mod_check_domain, check_domain)
+
+        # The modified domain can't get bigger by adding constraints
+        assert mod_check_domain <= check_domain
+
+        if not check_domain <= mod_check_domain:
+            print(check_domain)
+            print(mod_check_domain)
+            raise LoopyError("domain of preexisting inames does not match "
+                    "domain needed for precompute")
+
+        # }}}
+
+        # {{{ check that we didn't shrink the original domain
+
+        # project out the new names from the modified domain
+        orig_domain_inames = 
list(domch.domain.get_var_dict(isl.dim_type.set)) + mod_check_domain = mod_domain.project_out_except( + orig_domain_inames, [isl.dim_type.set]) + + check_domain = domch.domain + + mod_check_domain, check_domain = isl.align_two( + mod_check_domain, check_domain) + + # The modified domain can't get bigger by adding constraints + assert mod_check_domain <= check_domain + + if not check_domain <= mod_check_domain: + print(check_domain) + print(mod_check_domain) + raise LoopyError("original domain got shrunk by applying the precompute") + + # }}} + + # }}} - new_kernel_domains = domch.get_domains_with(domch.domain) + new_kernel_domains = domch.get_domains_with(mod_domain) else: # leave kernel domains unchanged diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 9de31233b831215bba72ba1e226b0df555b42286..abd9b770ca5c449b1e774b4eccbe3bd314ff7b6a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -519,7 +519,7 @@ def realize_reduction(kernel, insn_id_filter=None): # }}} -# {{{ duplicate private vars for ilp +# {{{ duplicate private vars for ilp and vec from loopy.symbolic import IdentityMapper @@ -550,12 +550,12 @@ class ExtraInameIndexInserter(IdentityMapper): return expr.index(new_idx) -def duplicate_private_temporaries_for_ilp(kernel): +def duplicate_private_temporaries_for_ilp_and_vec(kernel): logger.debug("%s: duplicate temporaries for ilp" % kernel.name) wmap = kernel.writer_map() - from loopy.kernel.data import IlpBaseTag + from loopy.kernel.data import IlpBaseTag, VectorizeTag var_to_new_ilp_inames = {} @@ -566,7 +566,9 @@ def duplicate_private_temporaries_for_ilp(kernel): writer_insn = kernel.id_to_insn[writer_insn_id] ilp_inames = frozenset(iname for iname in kernel.insn_inames(writer_insn) - if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) + if isinstance( + kernel.iname_to_tag.get(iname), + (IlpBaseTag, VectorizeTag))) referenced_ilp_inames = (ilp_inames & writer_insn.write_dependency_names()) @@ -618,10 +620,15 @@ def duplicate_private_temporaries_for_ilp(kernel): if shape is None: shape = () + dim_tags = ["c"] * (len(shape) + len(extra_shape)) + for i, iname in enumerate(inames): + if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag): + dim_tags[len(shape) + i] = "vec" + new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape, # Forget what you knew about data layout, # create from scratch. - dim_tags=None) + dim_tags=dim_tags) # }}} @@ -1074,7 +1081,7 @@ def preprocess_kernel(kernel, device=None): # duplicate_private_temporaries_for_ilp because reduction accumulators # need to be duplicated by this. 
- kernel = duplicate_private_temporaries_for_ilp(kernel) + kernel = duplicate_private_temporaries_for_ilp_and_vec(kernel) kernel = mark_local_temporaries(kernel) kernel = assign_automatic_axes(kernel) kernel = find_boostability(kernel) diff --git a/loopy/schedule.py b/loopy/schedule.py index 6ef5378d6456ca022f9dd27ee87e593810981e98..298b6e513cd0f4f049809ea648c81c37bc0efd75 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -593,21 +593,37 @@ def generate_loop_schedules_internal(sched_state, loop_priority, schedule=[], useful_and_desired = useful_loops_set & loop_priority_set if useful_and_desired: - priority_tiers = [[iname] + priority_tiers = [ + [iname] for iname in loop_priority if iname in useful_and_desired - and iname not in sched_state.lowest_priority_inames] + and iname not in sched_state.ilp_inames + and iname not in sched_state.vec_inames + ] priority_tiers.append( useful_loops_set - loop_priority_set - - sched_state.lowest_priority_inames) + - sched_state.ilp_inames + - sched_state.vec_inames + ) else: - priority_tiers = [useful_loops_set - sched_state.lowest_priority_inames] + priority_tiers = [ + useful_loops_set + - sched_state.ilp_inames + - sched_state.vec_inames + ] + + # vectorization must be the absolute innermost loop + priority_tiers.extend([ + [iname] + for iname in sched_state.ilp_inames + if iname in useful_loops_set + ]) priority_tiers.extend([ [iname] - for iname in sched_state.lowest_priority_inames + for iname in sched_state.vec_inames if iname in useful_loops_set ]) @@ -1029,11 +1045,15 @@ def generate_loop_schedules(kernel, debug_args={}): debug = ScheduleDebugger(**debug_args) - from loopy.kernel.data import IlpBaseTag, ParallelTag + from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag ilp_inames = set( iname for iname in kernel.all_inames() if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) + vec_inames = set( + iname + for iname in kernel.all_inames() + if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag)) parallel_inames = set( iname for iname in kernel.all_inames() if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) @@ -1042,9 +1062,10 @@ def generate_loop_schedules(kernel, debug_args={}): kernel=kernel, loop_nest_map=loop_nest_map(kernel), breakable_inames=ilp_inames, - lowest_priority_inames=ilp_inames, - # ILP is not parallel for the purposes of the scheduler - parallel_inames=parallel_inames - ilp_inames) + ilp_inames=ilp_inames, + vec_inames=vec_inames, + # ilp and vec are not parallel for the purposes of the scheduler + parallel_inames=parallel_inames - ilp_inames - vec_inames) generators = [ generate_loop_schedules_internal(sched_state, loop_priority, diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 89b0d578b4871bc6d51177b6f9d6f6edd893db1a..28943644ec9fcf540257f519c6e338bc5e7d4806 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -54,11 +54,9 @@ class CTarget(TargetBase): def dtype_to_typename(self, dtype): return self.get_dtype_registry().dtype_to_ctype(dtype) - def get_expression_to_code_mapper(self, kernel, - seen_dtypes, seen_functions, allow_complex): + def get_expression_to_code_mapper(self, codegen_state): from loopy.target.c.codegen.expression import LoopyCCodeMapper - return (LoopyCCodeMapper(kernel, seen_dtypes, seen_functions, - allow_complex=allow_complex)) + return LoopyCCodeMapper(codegen_state) # {{{ code generation diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 
d54640b5d8e277be32f9314db1453a05f9097a08..71844cfbf057c4533669dfbdce71e13f5e0a1ca3 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -49,52 +49,18 @@ def get_opencl_vec_member(idx): # {{{ C code mapper class LoopyCCodeMapper(RecursiveMapper): - def __init__(self, kernel, seen_dtypes, seen_functions, var_subst_map={}, - allow_complex=False): - """ - :arg seen_dtypes: set of dtypes that were encountered - :arg seen_functions: set of tuples (name, c_name, arg_types) indicating - functions that were encountered. - """ + def __init__(self, codegen_state): + self.kernel = codegen_state.kernel + self.codegen_state = codegen_state - self.kernel = kernel - self.seen_dtypes = seen_dtypes - self.seen_functions = seen_functions - - self.type_inf_mapper = TypeInferenceMapper(kernel) - self.allow_complex = allow_complex - - self.var_subst_map = var_subst_map.copy() - - # {{{ copy helpers - - def copy(self, var_subst_map=None): - if var_subst_map is None: - var_subst_map = self.var_subst_map - return LoopyCCodeMapper(self.kernel, self.seen_dtypes, self.seen_functions, - var_subst_map=var_subst_map, - allow_complex=self.allow_complex) - - def copy_and_assign(self, name, value): - """Make a copy of self with variable *name* fixed to *value*.""" - var_subst_map = self.var_subst_map.copy() - var_subst_map[name] = value - return self.copy(var_subst_map=var_subst_map) - - def copy_and_assign_many(self, assignments): - """Make a copy of self with *assignments* included.""" - - var_subst_map = self.var_subst_map.copy() - var_subst_map.update(assignments) - return self.copy(var_subst_map=var_subst_map) - - # }}} + self.type_inf_mapper = TypeInferenceMapper(self.kernel) + self.allow_complex = codegen_state.allow_complex # {{{ helpers def infer_type(self, expr): result = self.type_inf_mapper(expr) - self.seen_dtypes.add(result) + self.codegen_state.seen_dtypes.add(result) return result def join_rec(self, joiner, iterable, prec, type_context, needed_dtype=None): @@ -133,14 +99,14 @@ class LoopyCCodeMapper(RecursiveMapper): "entry to loopy") def map_variable(self, expr, enclosing_prec, type_context): - if expr.name in self.var_subst_map: + if expr.name in self.codegen_state.var_subst_map: if self.kernel.options.annotate_inames: return " /* %s */ %s" % ( expr.name, - self.rec(self.var_subst_map[expr.name], + self.rec(self.codegen_state.var_subst_map[expr.name], enclosing_prec, type_context)) else: - return str(self.rec(self.var_subst_map[expr.name], + return str(self.rec(self.codegen_state.var_subst_map[expr.name], enclosing_prec, type_context)) elif expr.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.name] @@ -201,7 +167,8 @@ class LoopyCCodeMapper(RecursiveMapper): from pymbolic import evaluate access_info = get_access_info(self.kernel.target, ary, expr.index, - lambda expr: evaluate(expr, self.var_subst_map)) + lambda expr: evaluate(expr, self.codegen_state.var_subst_map), + self.codegen_state.vectorization_info) vec_member = get_opencl_vec_member(access_info.vector_index) @@ -310,7 +277,8 @@ class LoopyCCodeMapper(RecursiveMapper): def seen_func(name): idt = self.kernel.index_dtype from loopy.codegen import SeenFunction - self.seen_functions.add(SeenFunction(name, name, (idt, idt))) + self.codegen_state.seen_functions.add( + SeenFunction(name, name, (idt, idt))) if den_nonneg: if num_nonneg: @@ -432,7 +400,8 @@ class LoopyCCodeMapper(RecursiveMapper): % identifier) from loopy.codegen import SeenFunction - self.seen_functions.add(SeenFunction(identifier, 
c_name, par_dtypes)) + self.codegen_state.seen_functions.add( + SeenFunction(identifier, c_name, par_dtypes)) if str_parameters is None: # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to # propagate the type context here. But for many others, it does diff --git a/requirements.txt b/requirements.txt index f4ce2c74cd4449717346ad04a3e2c105cad2f4d9..5a5f17baad3017bdf37c98fd7cd1d74d026e1f7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ git+git://github.com/inducer/pytools git+git://github.com/inducer/islpy cgen -git+git://github.com/pyopencl/pyopencl@cffi +git+git://github.com/pyopencl/pyopencl git+git://github.com/inducer/pymbolic hg+https://bitbucket.org/inducer/f2py diff --git a/test/test_fortran.py b/test/test_fortran.py index 4117b80a27b243dee1db94b5a0bb2b83b2ec8d49..c31c370076b681cb0593f38b6a4d92479541b872 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -453,6 +453,49 @@ def test_parse_and_fuse_two_kernels(): knl, = lp.parse_transformed_fortran(fortran_src) +def test_precompute_some_exist(ctx_factory): + fortran_src = """ + subroutine dgemm(m,n,l,a,b,c) + implicit none + real*8 a(m,l),b(l,n),c(m,n) + integer m,n,k,i,j,l + + do j = 1,n + do i = 1,m + do k = 1,l + c(i,j) = c(i,j) + b(k,j)*a(i,k) + end do + end do + end do + end subroutine + """ + + knl, = lp.parse_fortran(fortran_src) + + assert len(knl.domains) == 1 + + knl = lp.split_iname(knl, "i", 8, + outer_tag="g.0", inner_tag="l.1") + knl = lp.split_iname(knl, "j", 8, + outer_tag="g.1", inner_tag="l.0") + knl = lp.split_iname(knl, "k", 8) + knl = lp.assume(knl, "n mod 8 = 0") + knl = lp.assume(knl, "m mod 8 = 0") + knl = lp.assume(knl, "l mod 8 = 0") + + knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") + knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_inames="ktemp,itemp") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_inames="itemp,k2temp") + + ref_knl = knl + + ctx = ctx_factory() + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1527e4ff710871f4ee7cddde95ad2016d78969a2..22c0ce47c8cfc2aea051b63d5f50603f0d406f70 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1,7 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range +from __future__ import division, absolute_import, print_function __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import six +from six.moves import range import sys import numpy as np @@ -181,7 +180,7 @@ def test_simple_side_effect(ctx_factory): for gen_knl in kernel_gen: print(gen_knl) compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.code) + print(compiled.get_code()) def test_nonsense_reduction(ctx_factory): @@ -218,7 +217,7 @@ def test_owed_barriers(ctx_factory): for gen_knl in kernel_gen: compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.code) + print(compiled.get_code()) def test_wg_too_small(ctx_factory): @@ -312,7 +311,7 @@ def test_multi_cse(ctx_factory): for gen_knl in kernel_gen: compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.code) + print(compiled.get_code()) def test_stencil(ctx_factory): @@ -356,7 +355,7 @@ def test_stencil(ctx_factory): for variant in [variant_1, variant_2]: lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - fills_entire_output=False, print_ref_code=False, + print_ref_code=False, op_count=[n*n], op_label=["cells"]) @@ -397,7 +396,7 @@ def test_stencil_with_overfetch(ctx_factory): for variant in [variant_overfetch]: n = 200 lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - fills_entire_output=False, print_ref_code=False, + print_ref_code=False, op_count=[n*n], parameters=dict(n=n), op_label=["cells"]) @@ -553,7 +552,7 @@ def test_fuzz_code_generator(ctx_factory): print("true=%r" % true_value) print("loopy=%r" % lp_value) print(80*"-") - print(ck.code) + print(ck.get_code()) print(80*"-") print(var_values) print(80*"-") @@ -643,7 +642,7 @@ def test_multi_nested_dependent_reduction(ctx_factory): assumptions="ntgts>=1") cknl = lp.CompiledKernel(ctx, knl) - print(cknl.code) + print(cknl.get_code()) # FIXME: Actually test functionality. @@ -871,8 +870,7 @@ def test_equality_constraints(ctx_factory): #print(knl.domains[0].detect_equalities()) lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n), print_ref_code=True, - fills_entire_output=False) + parameters=dict(n=n), print_ref_code=True) def test_stride(ctx_factory): @@ -898,7 +896,7 @@ def test_stride(ctx_factory): seq_knl = knl lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n), fills_entire_output=False) + parameters=dict(n=n)) def test_domain_dependency_via_existentially_quantified_variable(ctx_factory): @@ -926,8 +924,7 @@ def test_domain_dependency_via_existentially_quantified_variable(ctx_factory): seq_knl = knl lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n), - fills_entire_output=False) + parameters=dict(n=n)) def test_double_sum(ctx_factory): @@ -1581,8 +1578,7 @@ def test_vector_types(ctx_factory, vec_len): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict( n=20000 - ), - fills_entire_output=False) + )) def test_tag_data_axes(ctx_factory): @@ -2064,6 +2060,33 @@ def test_precompute_with_preexisting_inames_fail(): precompute_inames="ii,jj") +def test_vectorize(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + <> temp = 2*b[i] + a[i] = temp + """) + knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32)) + knl = lp.split_arg_axis(knl, [("a", 0), ("b", 0)], 4, + split_kwargs=dict(slabs=(0, 1))) + + knl = lp.tag_data_axes(knl, "a,b", "c,vec") + ref_knl = knl + ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) + + knl = lp.tag_inames(knl, {"i_inner": "vec"}) + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + code, inf = lp.generate_code(knl) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=30)) + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])