From 22d6ad35266bce2919df5c79fa92d335852fa44e Mon Sep 17 00:00:00 2001 From: Dominic Kempf Date: Mon, 27 Mar 2017 14:51:26 +0200 Subject: [PATCH 1/5] First steps towards implementing vector iname reductions This commit * adds a callback for reductions of vec-tagged inames * removes the logic "instruction generated => reduction was realized" because vector reductions do not necessarily write new instructions * adds emit_vector_reduction to ASTBuilderBase. By default, it throws Unvectorizable. --- loopy/preprocess.py | 166 ++++++++++++++++++++++----------------- loopy/target/__init__.py | 7 +- 2 files changed, 102 insertions(+), 71 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0d8e77195..ba45d192c 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.codegen import Unvectorizable import logging logger = logging.getLogger(__name__) @@ -548,6 +549,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] # }}} + # {{{ reduction of vector iname + + def map_reduction_vector(expr, rec, nresults, arg_dtype, reduction_dtypes): + builder = kernel.target.get_device_ast_builder() + return builder.emit_vector_reduction(expr) + + # }}} + + # {{{ seq/par dispatch def map_reduction(expr, rec, nresults=1): @@ -581,6 +591,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): n_sequential = 0 n_local_par = 0 + n_vec = 0 from loopy.kernel.data import ( LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, @@ -596,9 +607,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): elif isinstance(iname_tag, LocalIndexTagBase): n_local_par += 1 - elif isinstance(iname_tag, (ParallelTag, 
VectorizeTag)): - raise LoopyError("the only form of parallelism supported " - "by reductions is 'local'--found iname '%s' " + elif isinstance(iname_tag, VectorizeTag): + n_vec += 1 + + elif isinstance(iname_tag, ParallelTag): + raise LoopyError("the only forms of parallelism supported " + "by reductions is 'local' and 'vec'--found iname '%s' " "tagged '%s'" % (iname, type(iname_tag).__name__)) @@ -619,6 +633,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): "before code generation." % ", ".join(expr.inames)) + if n_vec > 1: + raise LoopyError("Reduction over '%s' contains more than one iname" + "tagged as vectorized." % ", ".join(expr.inames)) + + if n_vec: + try: + #TODO this one should go away for sure + assert n_sequential == 0 + return map_reduction_vector(expr, rec, nresults, arg_dtype, + reduction_dtypes) + except Unvectorizable: + # Fall back to implementing this as a sequential reduction + n_sequential += 1 + if n_sequential: assert n_local_par == 0 return map_reduction_seq(expr, rec, nresults, arg_dtype, @@ -662,82 +690,80 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): nresults = len(insn.assignees) - # Run reduction expansion. - from loopy.symbolic import Reduction - if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) - else: - new_expressions = (cb_mapper(insn.expression),) + # Do the expansion of reductions + new_expressions = cb_mapper(insn.expression, nresults=nresults) - if generated_insns: - # An expansion happened, so insert the generated stuff plus - # ourselves back into the queue. 
+ # Find out whether we actually expanded something and process + # the results if so + if new_expressions == insn.expression: + assert not new_insn_add_depends_on + new_insns.append(insn) + continue - kwargs = insn.get_copy_kwargs( - depends_on=insn.depends_on - | frozenset(new_insn_add_depends_on), - no_sync_with=insn.no_sync_with - | frozenset(new_insn_add_no_sync_with), - within_inames=( - temp_kernel.insn_inames(insn) - | new_insn_add_within_inames)) - - kwargs.pop("id") - kwargs.pop("expression") - kwargs.pop("assignee", None) - kwargs.pop("assignees", None) - kwargs.pop("temp_var_type", None) - kwargs.pop("temp_var_types", None) - - if isinstance(insn.expression, Reduction) and nresults > 1: - replacement_insns = [ - lp.Assignment( - id=insn_id_gen(insn.id), - assignee=assignee, - expression=new_expr, - **kwargs) - for assignee, new_expr in zip( - insn.assignees, new_expressions)] + if not isinstance(new_expressions, tuple): + new_expressions = (new_expressions,) - else: - new_expr, = new_expressions - replacement_insns = [ - make_assignment( - id=insn_id_gen(insn.id), - assignees=insn.assignees, - expression=new_expr, - **kwargs) - ] - - insn_id_replacements[insn.id] = [ - rinsn.id for rinsn in replacement_insns] - - insn_queue = generated_insns + replacement_insns + insn_queue - - # The reduction expander needs an up-to-date kernel - # object to find dependencies. Keep temp_kernel up-to-date. - - temp_kernel = kernel.copy( - instructions=new_insns + insn_queue, - temporary_variables=new_temporary_variables, - domains=domains) - temp_kernel = lp.replace_instruction_ids( - temp_kernel, insn_id_replacements) + # An expansion happened, so insert the generated stuff plus + # ourselves back into the queue. 
- else: - # nothing happened, we're done with insn - assert not new_insn_add_depends_on + kwargs = insn.get_copy_kwargs( + depends_on=insn.depends_on + | frozenset(new_insn_add_depends_on), + no_sync_with=insn.no_sync_with + | frozenset(new_insn_add_no_sync_with), + within_inames=( + temp_kernel.insn_inames(insn) + | new_insn_add_within_inames)) - new_insns.append(insn) + kwargs.pop("id") + kwargs.pop("expression") + kwargs.pop("assignee", None) + kwargs.pop("assignees", None) + kwargs.pop("temp_var_type", None) + kwargs.pop("temp_var_types", None) + + from loopy.symbolic import Reduction + if isinstance(insn.expression, Reduction) and nresults > 1: + replacement_insns = [ + lp.Assignment( + id=insn_id_gen(insn.id), + assignee=assignee, + expression=new_expr, + **kwargs) + for assignee, new_expr in zip( + insn.assignees, new_expressions)] + else: + new_expr, = new_expressions + replacement_insns = [ + make_assignment( + id=insn_id_gen(insn.id), + assignees=insn.assignees, + expression=new_expr, + **kwargs) + ] + + insn_id_replacements[insn.id] = [ + rinsn.id for rinsn in replacement_insns] + + insn_queue = generated_insns + replacement_insns + insn_queue + + # The reduction expander needs an up-to-date kernel + # object to find dependencies. Keep temp_kernel up-to-date. 
+ + temp_kernel = kernel.copy( + instructions=new_insns + insn_queue, + temporary_variables=new_temporary_variables, + domains=domains) + temp_kernel = lp.replace_instruction_ids( + temp_kernel, insn_id_replacements) kernel = kernel.copy( - instructions=new_insns, - temporary_variables=new_temporary_variables, - domains=domains) + instructions=new_insns, + temporary_variables=new_temporary_variables, + domains=domains) kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - - kernel = lp.tag_inames(kernel, new_iname_tags) + kernel = lp.tag_inames(temp_kernel, new_iname_tags) return kernel diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5d5743bae..69e6020ab 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -41,6 +41,8 @@ __doc__ = """ """ +from loopy.codegen import Unvectorizable + class TargetBase(object): """Base class for all targets, i.e. different combinations of code that @@ -103,7 +105,7 @@ class TargetBase(object): def get_device_ast_builder(self): """ - :returns: a class implementing :class:`ASTBuilderBase` for the host code + :returns: a class implementing :class:`ASTBuilderBase` for the device code """ raise NotImplementedError() @@ -210,6 +212,9 @@ class ASTBuilderBase(object): static_lbound, static_ubound, inner): raise NotImplementedError() + def emit_vector_reduction(self, expr): + raise Unvectorizable() + def emit_if(self, condition_str, ast): raise NotImplementedError() -- GitLab From 193ed2b30640d6b5a8d0a76ad94f8c42b1db3e5e Mon Sep 17 00:00:00 2001 From: Dominic Kempf Date: Mon, 27 Mar 2017 15:17:54 +0200 Subject: [PATCH 2/5] [bugfix] Unify return type of reduction callbacks to tuple All the others use it already, and we want to use hashability. 
--- loopy/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ba45d192c..7ca6dbbb8 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -546,7 +546,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): assert len(acc_vars) == 1 return acc_vars[0][outer_local_iname_vars + (0,)] else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return tuple(acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars) # }}} # {{{ reduction of vector iname -- GitLab From 079764f90f1840841ae563c20b95b12a2e93fccb Mon Sep 17 00:00:00 2001 From: Dominic Kempf Date: Mon, 27 Mar 2017 15:35:10 +0200 Subject: [PATCH 3/5] [bugfix] Do not pass nresults into the callback, if == 1 It can't handle it... --- loopy/preprocess.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 7ca6dbbb8..3cf703002 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -546,7 +546,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): assert len(acc_vars) == 1 return acc_vars[0][outer_local_iname_vars + (0,)] else: - return tuple(acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars) + return tuple(acc_var[outer_local_iname_vars + (0,)] + for acc_var in acc_vars) # }}} # {{{ reduction of vector iname @@ -557,7 +558,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): # }}} - # {{{ seq/par dispatch def map_reduction(expr, rec, nresults=1): @@ -691,21 +691,21 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): nresults = len(insn.assignees) # Do the expansion of reductions - new_expressions = cb_mapper(insn.expression, nresults=nresults) - - # Find out whether we actually expanded something and process - # the results if so - if new_expressions == insn.expression: - assert not
new_insn_add_depends_on - new_insns.append(insn) - continue + from loopy.symbolic import Reduction + if isinstance(insn.expression, Reduction) and nresults > 1: + new_expressions = cb_mapper(insn.expression, nresults=nresults) + else: + new_expressions = (cb_mapper(insn.expression),) - if not isinstance(new_expressions, tuple): - new_expressions = (new_expressions,) + # Find out whether we actually expanded something. + # If not, we save this insn for the final kernel and continue + if new_expressions == (insn.expression,): + assert not new_insn_add_depends_on + new_insns.append(insn) + continue # An expansion happened, so insert the generated stuff plus # ourselves back into the queue. - kwargs = insn.get_copy_kwargs( depends_on=insn.depends_on | frozenset(new_insn_add_depends_on), @@ -722,7 +722,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): kwargs.pop("temp_var_type", None) kwargs.pop("temp_var_types", None) - from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: replacement_insns = [ lp.Assignment( -- GitLab From a02f04cc334c889f0bac3a94f6b4c438934aa3d2 Mon Sep 17 00:00:00 2001 From: Dominic Kempf Date: Tue, 28 Mar 2017 13:25:58 +0200 Subject: [PATCH 4/5] Implement an unrolled fallback for vector reductions --- loopy/preprocess.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3cf703002..9acde7e9f 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -553,9 +553,35 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): # {{{ reduction of vector iname def map_reduction_vector(expr, rec, nresults, arg_dtype, reduction_dtypes): - builder = kernel.target.get_device_ast_builder() - return builder.emit_vector_reduction(expr) + try: + builder = kernel.target.get_device_ast_builder() + return builder.emit_vector_reduction(expr) + except Unvectorizable: + # 
Provide an unrolled fallback + from loopy.kernel.data import VectorizeTag + iname, = tuple(i for i in expr.inames + if isinstance(kernel.iname_to_tag[i], VectorizeTag)) + + # Extract the vector length from the domain + bound = kernel.get_iname_bounds(iname, constants_only=True) + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + length_aff = static_max_of_pw_aff(bound.size, constants_only=True) + length = int(pw_aff_to_expr(length_aff)) + + # The below code should return a neutral element, if the loop domain + # is empty - can that actually happen? + assert length > 0 + + # Unroll the reduction + from pymbolic import substitute, var + result = substitute(expr.expr, variable_assignments={var(iname): 0}) + for idx in range(1, length): + newi = substitute(expr.expr, variable_assignments={var(iname): idx}) + result = expr.operation(arg_dtype, result, newi, + set(expr.inames) - set([iname])) + return result # }}} # {{{ seq/par dispatch @@ -638,14 +664,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): "tagged as vectorized." 
% ", ".join(expr.inames)) if n_vec: - try: - #TODO this one should go away for sure - assert n_sequential == 0 - return map_reduction_vector(expr, rec, nresults, arg_dtype, - reduction_dtypes) - except Unvectorizable: - # Fall back to implementing this as a sequential reduction - n_sequential += 1 + return map_reduction_vector(expr, rec, nresults, arg_dtype, + reduction_dtypes) if n_sequential: assert n_local_par == 0 -- GitLab From 58ea2c5cf95fd33b906cd0b365144d156f647a93 Mon Sep 17 00:00:00 2001 From: Dominic Kempf Date: Tue, 28 Mar 2017 13:39:20 +0200 Subject: [PATCH 5/5] Add a test for vector reduction fallback --- test/test_reduction.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/test_reduction.py b/test/test_reduction.py index 86e72c0c6..26e9a7a47 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -405,6 +405,29 @@ def test_parallel_multi_output_reduction(): # TODO: Add functional test +def test_vector_reduction_fallback(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0<=i<4}", + """ + b = sum(i, a[i]) + """, + [lp.GlobalArg("a", np.float32, shape=(4,), dim_tags=("vec",)), + lp.GlobalArg("b", np.float32, shape=())] + ) + + knl = lp.tag_inames(knl, [("i", "vec")]) + knl = lp.preprocess_kernel(knl) + print(knl) + + a = np.random.randn(4) + ref = sum(a) + evt, (a, b) = knl(queue, a=a) + assert b.get() == ref + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab