diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0d8e771954cf26cc11747e745946389420fa5e1b..9acde7e9f0ed70f6f336d7d12cbf39c1afb63f85 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.codegen import Unvectorizable import logging logger = logging.getLogger(__name__) @@ -545,7 +546,42 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): assert len(acc_vars) == 1 return acc_vars[0][outer_local_iname_vars + (0,)] else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return tuple(acc_var[outer_local_iname_vars + (0,)] + for acc_var in acc_vars) + # }}} + + # {{{ reduction of vector iname + + def map_reduction_vector(expr, rec, nresults, arg_dtype, reduction_dtypes): + try: + builder = kernel.target.get_device_ast_builder() + return builder.emit_vector_reduction(expr) + except Unvectorizable: + # Provide an unrolled fallback + from loopy.kernel.data import VectorizeTag + iname, = tuple(i for i in expr.inames + if isinstance(kernel.iname_to_tag[i], VectorizeTag)) + + # Extract the vector length from the domain + bound = kernel.get_iname_bounds(iname, constants_only=True) + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + length_aff = static_max_of_pw_aff(bound.size, constants_only=True) + length = int(pw_aff_to_expr(length_aff)) + + # The below code should return a neutral element, if the loop domain + # is empty - can that actually happen? 
+            assert length > 0
+
+            # Unroll the reduction
+            from pymbolic import substitute, var
+            result = substitute(expr.expr, variable_assignments={var(iname): 0})
+            for idx in range(1, length):
+                newi = substitute(expr.expr, variable_assignments={var(iname): idx})
+                result = expr.operation(arg_dtype, result, newi,
+                        set(expr.inames) - set([iname]))
+
+            return result

     # }}}

     # {{{ seq/par dispatch

@@ -581,6 +617,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):

         n_sequential = 0
         n_local_par = 0
+        n_vec = 0

         from loopy.kernel.data import (
                 LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
@@ -596,9 +633,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             elif isinstance(iname_tag, LocalIndexTagBase):
                 n_local_par += 1

-            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
-                raise LoopyError("the only form of parallelism supported "
-                        "by reductions is 'local'--found iname '%s' "
+            elif isinstance(iname_tag, VectorizeTag):
+                n_vec += 1
+
+            elif isinstance(iname_tag, ParallelTag):
+                raise LoopyError("the only forms of parallelism supported "
+                        "by reductions are 'local' and 'vec'--found iname '%s' "
                         "tagged '%s'"
                         % (iname, type(iname_tag).__name__))

@@ -619,6 +659,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     "before code generation."
                     % ", ".join(expr.inames))

+        if n_vec > 1:
+            raise LoopyError("Reduction over '%s' contains more than one iname "
+                    "tagged as vectorized." % ", ".join(expr.inames))
+
+        if n_vec:
+            return map_reduction_vector(expr, rec, nresults, arg_dtype,
+                    reduction_dtypes)
+
         if n_sequential:
             assert n_local_par == 0
             return map_reduction_seq(expr, rec, nresults, arg_dtype,
@@ -662,82 +710,79 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):

             nresults = len(insn.assignees)

-            # Run reduction expansion.
+ # Do the expansion of reductions from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: new_expressions = cb_mapper(insn.expression, nresults=nresults) else: new_expressions = (cb_mapper(insn.expression),) - if generated_insns: - # An expansion happened, so insert the generated stuff plus - # ourselves back into the queue. - - kwargs = insn.get_copy_kwargs( - depends_on=insn.depends_on - | frozenset(new_insn_add_depends_on), - no_sync_with=insn.no_sync_with - | frozenset(new_insn_add_no_sync_with), - within_inames=( - temp_kernel.insn_inames(insn) - | new_insn_add_within_inames)) - - kwargs.pop("id") - kwargs.pop("expression") - kwargs.pop("assignee", None) - kwargs.pop("assignees", None) - kwargs.pop("temp_var_type", None) - kwargs.pop("temp_var_types", None) - - if isinstance(insn.expression, Reduction) and nresults > 1: - replacement_insns = [ - lp.Assignment( - id=insn_id_gen(insn.id), - assignee=assignee, - expression=new_expr, - **kwargs) - for assignee, new_expr in zip( - insn.assignees, new_expressions)] + # Find out whether we actually expanded something. + # If not, we save this insn for the final kernel and continue + if new_expressions == (insn.expression,): + assert not new_insn_add_depends_on + new_insns.append(insn) + continue + + # An expansion happened, so insert the generated stuff plus + # ourselves back into the queue. 
+ kwargs = insn.get_copy_kwargs( + depends_on=insn.depends_on + | frozenset(new_insn_add_depends_on), + no_sync_with=insn.no_sync_with + | frozenset(new_insn_add_no_sync_with), + within_inames=( + temp_kernel.insn_inames(insn) + | new_insn_add_within_inames)) - else: - new_expr, = new_expressions - replacement_insns = [ - make_assignment( - id=insn_id_gen(insn.id), - assignees=insn.assignees, - expression=new_expr, - **kwargs) - ] - - insn_id_replacements[insn.id] = [ - rinsn.id for rinsn in replacement_insns] - - insn_queue = generated_insns + replacement_insns + insn_queue - - # The reduction expander needs an up-to-date kernel - # object to find dependencies. Keep temp_kernel up-to-date. - - temp_kernel = kernel.copy( - instructions=new_insns + insn_queue, - temporary_variables=new_temporary_variables, - domains=domains) - temp_kernel = lp.replace_instruction_ids( - temp_kernel, insn_id_replacements) + kwargs.pop("id") + kwargs.pop("expression") + kwargs.pop("assignee", None) + kwargs.pop("assignees", None) + kwargs.pop("temp_var_type", None) + kwargs.pop("temp_var_types", None) + if isinstance(insn.expression, Reduction) and nresults > 1: + replacement_insns = [ + lp.Assignment( + id=insn_id_gen(insn.id), + assignee=assignee, + expression=new_expr, + **kwargs) + for assignee, new_expr in zip( + insn.assignees, new_expressions)] else: - # nothing happened, we're done with insn - assert not new_insn_add_depends_on - - new_insns.append(insn) + new_expr, = new_expressions + replacement_insns = [ + make_assignment( + id=insn_id_gen(insn.id), + assignees=insn.assignees, + expression=new_expr, + **kwargs) + ] + + insn_id_replacements[insn.id] = [ + rinsn.id for rinsn in replacement_insns] + + insn_queue = generated_insns + replacement_insns + insn_queue + + # The reduction expander needs an up-to-date kernel + # object to find dependencies. Keep temp_kernel up-to-date. 
+ + temp_kernel = kernel.copy( + instructions=new_insns + insn_queue, + temporary_variables=new_temporary_variables, + domains=domains) + temp_kernel = lp.replace_instruction_ids( + temp_kernel, insn_id_replacements) kernel = kernel.copy( - instructions=new_insns, - temporary_variables=new_temporary_variables, - domains=domains) + instructions=new_insns, + temporary_variables=new_temporary_variables, + domains=domains) kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - - kernel = lp.tag_inames(kernel, new_iname_tags) + kernel = lp.tag_inames(temp_kernel, new_iname_tags) return kernel diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5d5743bae322fc59c989cafd85122c8ca619c422..69e6020abcdaf4c446f36f48f2dbb40313b2d6b7 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -41,6 +41,8 @@ __doc__ = """ """ +from loopy.codegen import Unvectorizable + class TargetBase(object): """Base class for all targets, i.e. different combinations of code that @@ -103,7 +105,7 @@ class TargetBase(object): def get_device_ast_builder(self): """ - :returns: a class implementing :class:`ASTBuilderBase` for the host code + :returns: a class implementing :class:`ASTBuilderBase` for the device code """ raise NotImplementedError() @@ -210,6 +212,9 @@ class ASTBuilderBase(object): static_lbound, static_ubound, inner): raise NotImplementedError() + def emit_vector_reduction(self, expr): + raise Unvectorizable() + def emit_if(self, condition_str, ast): raise NotImplementedError() diff --git a/test/test_reduction.py b/test/test_reduction.py index 86e72c0c6644b7b9837a6d74da756c58344b1d6f..26e9a7a47c12f0e4a441ade883d4bd786877a739 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -405,6 +405,29 @@ def test_parallel_multi_output_reduction(): # TODO: Add functional test +def test_vector_reduction_fallback(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0<=i<4}", + """ + b = 
sum(i, a[i])
+        """,
+        [lp.GlobalArg("a", np.float32, shape=(4,), dim_tags=("vec",)),
+         lp.GlobalArg("b", np.float32, shape=())]
+    )
+
+    knl = lp.tag_inames(knl, [("i", "vec")])
+    knl = lp.preprocess_kernel(knl)
+    print(knl)
+
+    a = np.random.randn(4).astype(np.float32)
+    ref = sum(a)
+    evt, (a, b) = knl(queue, a=a)
+    assert np.isclose(b.get(), ref)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])