diff --git a/loopy/__init__.py b/loopy/__init__.py index 6bd764f8df93f1b4b2ae5755c1c90ccddc654fe6..6cbb3362ef91b27c3b7b1cf6a591f7f9a20c2f7a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -457,8 +457,8 @@ def _set_up_default_target(): set_default_target(target) -_set_up_default_target() +_set_up_default_target() # }}} diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 8ac963835ec12702f2010806d1d49062422318a2..fd36a5bee2bbe88c3e763410271620640c847778 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -148,8 +148,14 @@ def generate_unroll_loop(codegen_state, sched_index): for i in range(length): idx_aff = lower_bound_aff + i new_codegen_state = codegen_state.fix(iname, idx_aff) - result.append( - build_loop_nest(new_codegen_state, sched_index+1)) + inner = build_loop_nest(new_codegen_state, sched_index+1) + inner = inner.with_new_ast( + new_codegen_state, + codegen_state.ast_builder.emit_scope( + new_codegen_state, + (iname,), + inner.current_ast(new_codegen_state))) + result.append(inner) return merge_codegen_results(codegen_state, result) @@ -236,17 +242,29 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, from loopy.schedule import get_insn_ids_for_block_at insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) - if hw_inames_left is None: - all_inames_by_insns = set() + def get_all_inames_by_insns(): + result = set() for insn_id in insn_ids_for_block: - all_inames_by_insns |= kernel.insn_inames(insn_id) + result |= kernel.insn_inames(insn_id) + return result + if hw_inames_left is None: hw_inames_left = [iname - for iname in all_inames_by_insns + for iname in get_all_inames_by_insns() if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] if not hw_inames_left: - return next_func(codegen_state) + hw_inames = tuple(iname + for iname in get_all_inames_by_insns() + if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)) + + inner = next_func(codegen_state) + return inner.with_new_ast( + codegen_state, + codegen_state.ast_builder.emit_scope( + codegen_state, + hw_inames, + inner.current_ast(codegen_state))) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( insn_ids_for_block) @@ -423,6 +441,12 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): kernel, slab, iname))) inner = build_loop_nest(new_codegen_state, sched_index+1) + inner = inner.with_new_ast( + new_codegen_state, + codegen_state.ast_builder.emit_scope( + new_codegen_state, + (loop_iname,), + inner.current_ast(new_codegen_state))) # }}} diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index c683d120dbf6a1205618f8835e8f9c72dd13adf7..df43159f410b69c734b053268b3dc62a2a983840 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -300,7 +300,7 @@ def generate_host_or_device_program(codegen_state, schedule_index): collapse=False) cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast + body_ast = ast_builder.process_ast(codegen_state, cur_prog.ast) fdecl_ast = ast_builder.get_function_declaration( codegen_state, codegen_result, schedule_index) @@ -311,8 +311,8 @@ def generate_host_or_device_program(codegen_state, schedule_index): codegen_result = codegen_result.with_new_program( codegen_state, cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) + ast=fdef_ast, + body_ast=body_ast)) return codegen_result diff --git a/loopy/options.py b/loopy/options.py index c88c512cb332ceec4587fd3c5011b9f729cad7d5..cfafd443876f8f76de480340e123a8dc5a39b460 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -70,6 +70,8 @@ class Options(Record): Unless otherwise specified, these options are Boolean-valued (i.e. on/off). + .. ------------------------------------------------------------------------ + .. rubric:: Code-generation options .. attribute:: annotate_inames @@ -95,6 +97,16 @@ class Options(Record): determining whether an iname duplication is necessary for the kernel to be schedulable. + .. attribute:: eliminate_common_subscripts + + If ``True`` (the default), variable assignments with + precomputed subexpressions of array indices will be + emitted. + + .. versionadded:: 2016.3 + + .. ------------------------------------------------------------------------ + .. rubric:: Invocation-related options .. attribute:: skip_arg_checks @@ -145,6 +157,8 @@ class Options(Record): A :class:`bool`. Whether to allow colors in terminal output + .. ------------------------------------------------------------------------ + .. rubric:: Features .. attribute:: disable_global_barriers @@ -188,6 +202,8 @@ class Options(Record): trace_assignments=kwargs.get("trace_assignments", False), trace_assignment_values=kwargs.get("trace_assignment_values", False), ignore_boostable_into=kwargs.get("ignore_boostable_into", False), + eliminate_common_subscripts=kwargs.get("eliminate_common_subscripts", + False), skip_arg_checks=kwargs.get("skip_arg_checks", False), no_numpy=kwargs.get("no_numpy", False), diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b1743cb1af927fe8a68d8566ea61ce4e511c5e1a..ce4f949939cd00cff8b25d7df4bec04253ddd0d2 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -179,10 +179,10 @@ class StringifyMapper(StringifyMapperBase): return "{%s}" % ", ".join(self.rec(ch) for ch in expr.children) def map_group_hw_index(self, expr, enclosing_prec): - return "grp.%d" % expr.index + return "grp.%d" % expr.axis def map_local_hw_index(self, expr, enclosing_prec): - return "loc.%d" % expr.index + return "loc.%d" % expr.axis def map_reduction(self, expr, prec): return "%sreduce(%s, [%s], %s)" % ( diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5d5743bae322fc59c989cafd85122c8ca619c422..71daa13e78661ff8a38ec94edf278664330a77ef 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -213,7 +213,11 @@ class ASTBuilderBase(object): def emit_if(self, condition_str, ast): raise NotImplementedError() - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_scope(self, codegen_state, available_variables, ast): + return ast + + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const, + short_for_expr=None): raise NotImplementedError() def emit_declaration_scope(self, codegen_state, inner): @@ -227,7 +231,7 @@ class ASTBuilderBase(object): # }}} - def process_ast(self, node): + def process_ast(self, codegen_state, node): return node @@ -287,7 +291,8 @@ class DummyHostASTBuilder(ASTBuilderBase): def emit_if(self, condition_str, ast): return None - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const, + short_for_expr=None): return None def emit_blank_line(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index be83ec90c4720f10876e1a5e47a43c429fc40aeb..367dcd40b37577945f6d6bb5962f0b84af8f44de 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1,4 +1,5 @@ -"""Plain C target and base for other C-family languages.""" +"""Target for C-family languages. Usable for outputting C and as a base \ +for other C-family languages.""" from __future__ import division, absolute_import @@ -33,6 +34,7 @@ from cgen import Pointer, NestedDeclarator, Block from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper +import cgen import pymbolic.primitives as p from pytools import memoize_method @@ -96,7 +98,7 @@ def _preamble_generator(preamble_info): # }}} -# {{{ cgen overrides +# {{{ cgen syntax tree from cgen import Declarator @@ -133,6 +135,44 @@ class POD(Declarator): mapper_method = "map_loopy_pod" +class ScopeASTNode(cgen.Generable): + def __init__(self, var_subst_map, available_variables, child): + self.var_subst_map = var_subst_map + self.available_variables = available_variables + self.child = child + + def generate(self): + for i in self.child.generate(): + yield i + + mapper_method = "map_loopy_scope" + + +class CgenLoopyLoopMixin(object): + def generate(self): + if self.intro_line() is not None: + yield self.intro_line() + + body = self.body + if isinstance(body, ScopeASTNode): + body = body.child + + from cgen import Block + if isinstance(body, Block): + for line in body.generate(): + yield line + else: + for line in body.generate(): + yield " "+line + + if self.outro_line() is not None: + yield self.outro_line() + + +class For(CgenLoopyLoopMixin, cgen.For): + pass + + class ScopingBlock(Block): """A block that is mandatory for scoping and may not be simplified away by :func:`loopy.codegen.results.merge_codegen_results`. @@ -243,12 +283,17 @@ class ASTSubscriptCollector(CASTIdentityMapper): # {{{ lazy expression generation class CExpression(object): - def __init__(self, to_code_mapper, expr): - self.to_code_mapper = to_code_mapper + def __init__(self, codegen_state, expr): + self.codegen_state = codegen_state self.expr = expr def __str__(self): - return self.to_code_mapper(self.expr, PREC_NONE) + to_code_mapper = \ + self.codegen_state.ast_builder.get_c_expression_to_code_mapper( + self.codegen_state) + return to_code_mapper( + self.expr, + PREC_NONE) # }}} @@ -535,9 +580,9 @@ class CASTBuilder(ASTBuilderBase): return ExpressionToCExpressionMapper( codegen_state, fortran_abi=self.target.fortran_abi) - def get_c_expression_to_code_mapper(self): + def get_c_expression_to_code_mapper(self, codegen_state): from loopy.target.c.codegen.expression import CExpressionToCodeMapper - return CExpressionToCodeMapper() + return CExpressionToCodeMapper(codegen_state) def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info): temp_var_decl = POD(self, decl_info.dtype, decl_info.name) @@ -707,7 +752,9 @@ class CASTBuilder(ASTBuilderBase): if len(mangle_result.result_dtypes) == 0: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression( + codegen_state, + result)) result = ecm.wrap_in_typecast( mangle_result.result_dtypes[0], @@ -719,7 +766,9 @@ class CASTBuilder(ASTBuilderBase): from cgen import Assign return Assign( lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression( + codegen_state, + result)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): @@ -728,7 +777,7 @@ class CASTBuilder(ASTBuilderBase): from pymbolic import var from pymbolic.primitives import Comparison from pymbolic.mapper.stringifier import PREC_NONE - from cgen import For, InlineInitializer + from cgen import InlineInitializer return For( InlineInitializer( @@ -743,7 +792,8 @@ class CASTBuilder(ASTBuilderBase): "++%s" % iname, inner) - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const, + short_for_expr=None): decl = POD(self, dtype, name) from cgen import Initializer, Const @@ -765,12 +815,14 @@ class CASTBuilder(ASTBuilderBase): from cgen import If return If(condition_str, ast) + def emit_scope(self, codegen_state, available_variables, ast): + return ScopeASTNode(codegen_state.var_subst_map, available_variables, ast) + # }}} - def process_ast(self, node): - sc = ASTSubscriptCollector() - sc(node) - return node + def process_ast(self, codegen_state, node): + from loopy.target.c.subscript_cse import eliminate_common_subscripts + return eliminate_common_subscripts(codegen_state, node=node) # {{{ header generation diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 68cc32e56be077c7e45d11b9e2aade86b04494cc..e166556f69d09862a4a379aa38e1a1264f634b85 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -112,7 +112,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): assert prec == PREC_NONE from loopy.target.c import CExpression return CExpression( - self.codegen_state.ast_builder.get_c_expression_to_code_mapper(), + self.codegen_state, self.rec(expr, type_context, needed_dtype)) # }}} @@ -122,15 +122,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): return x if expr.name in self.codegen_state.var_subst_map: - if self.kernel.options.annotate_inames: - return var( - "/* %s */ %s" % ( - expr.name, - self.rec(self.codegen_state.var_subst_map[expr.name], - type_context))) - else: - return self.rec(self.codegen_state.var_subst_map[expr.name], - type_context) + return expr + elif expr.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.name] from loopy.kernel.array import ArrayBase @@ -662,10 +655,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} def map_group_hw_index(self, expr, type_context): - raise LoopyError("plain C does not have group hw axes") + return expr def map_local_hw_index(self, expr, type_context): - raise LoopyError("plain C does not have local hw axes") + return expr # }}} @@ -673,6 +666,9 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ C expression to code mapper class CExpressionToCodeMapper(RecursiveMapper): + def __init__(self, codegen_state): + self.codegen_state = codegen_state + # {{{ helpers def parenthesize_if_needed(self, s, enclosing_prec, my_prec): @@ -714,6 +710,19 @@ class CExpressionToCodeMapper(RecursiveMapper): "entry to loopy") def map_variable(self, expr, enclosing_prec): + if expr.name in self.codegen_state.var_subst_map: + if self.codegen_state.kernel.options.annotate_inames: + return var( + "/* %s */ %s" % ( + expr.name, + self.rec( + self.codegen_state.var_subst_map[expr.name], + enclosing_prec))) + else: + return self.rec( + self.codegen_state.var_subst_map[expr.name], + enclosing_prec) + return expr.name def map_lookup(self, expr, enclosing_prec): @@ -838,6 +847,12 @@ class CExpressionToCodeMapper(RecursiveMapper): self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) + def map_group_hw_index(self, expr, enclosing_prec): + raise LoopyError("plain C does not have group hw axes") + + def map_local_hw_index(self, expr, enclosing_prec): + raise LoopyError("plain C does not have local hw axes") + def map_array_literal(self, expr, enclosing_prec): return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE) diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py new file mode 100644 index 0000000000000000000000000000000000000000..0ffdd864a263cb67e6cf80935edaf064021660b6 --- /dev/null +++ b/loopy/target/c/subscript_cse.py @@ -0,0 +1,443 @@ +"""Common subexpression elimination in array subscripts.""" + +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six + +from cgen.mapper import IdentityMapper as CASTIdentityMapperBase +import pymbolic.primitives as p +from pytools import Record + +from loopy.symbolic import IdentityMapper as ExprIdentityMapper +from loopy.diagnostic import LoopyError +from loopy.target.c import CExpression + +import logging +logger = logging.getLogger(__name__) + + +# {{{ utilities + +class CASTIdentityMapper(CASTIdentityMapperBase): + def map_loopy_scope(self, node, *args, **kwargs): + return type(node)( + node.var_subst_map, + node.available_variables, + self.rec(node.child, *args, **kwargs)) + + def map_loopy_pod(self, node, *args, **kwargs): + return type(node)(node.ast_builder, node.dtype, node.name) + + +def generate_all_subsets(l, min_length): + for bits in range(2**len(l)): + if bin(bits).count("1") >= min_length: + yield frozenset(entry for i, entry in enumerate(l) if (1 << i) & bits) + + +def is_const_product(term): + return ( + p.is_constant(term) + or ( + isinstance(term, p.Product) + and all(is_const_product(ch) for ch in term.children))) + + +def get_terms(allowable_vars, expr): + if isinstance(expr, p.Sum): + terms = expr.children + else: + terms = (expr,) + + from loopy.symbolic import get_dependencies + + result = [] + remainder = [] + for term in terms: + deps = get_dependencies(term) + if (deps <= allowable_vars + and not is_const_product(term)): + result.append(term) + else: + remainder.append(term) + + return result, remainder + +# }}} + + +# {{{ counting + +class SubscriptSubsetCounter(ExprIdentityMapper): + def __init__(self, codegen_state, term_set_to_inside_inames_list, + inside_inames): + self.codegen_state = codegen_state + self.term_set_to_inside_inames_list = term_set_to_inside_inames_list + kernel = codegen_state.kernel + self.allowable_vars = kernel.all_inames() | kernel.outer_params() + self.inside_inames = inside_inames + + def map_subscript(self, expr): + terms, _ = get_terms(self.allowable_vars, expr.index) + terms = frozenset(terms) + self.term_set_to_inside_inames_list[terms] = ( + self.term_set_to_inside_inames_list.get(terms, []) + + [self.inside_inames]) + + +class ASTSubexpressionCollector(CASTIdentityMapper): + def __init__(self, codegen_state): + self.term_set_to_inside_inames_list = {} + self.codegen_state = codegen_state + self.inside_inames_stack = [] + + def map_loopy_scope(self, node): + if self.inside_inames_stack: + new_inside_inames = self.inside_inames_stack[-1] + else: + new_inside_inames = () + + new_inside_inames = ( + new_inside_inames + node.available_variables) + + self.inside_inames_stack.append(new_inside_inames) + result = super(ASTSubexpressionCollector, self).map_loopy_scope(node) + self.inside_inames_stack.pop() + return result + + def map_expression(self, expr): + from pymbolic.primitives import is_constant + if isinstance(expr, CExpression): + if self.inside_inames_stack: + inside_inames = self.inside_inames_stack[-1] + else: + inside_inames = () + count_mapper = SubscriptSubsetCounter( + self.codegen_state, + self.term_set_to_inside_inames_list, + inside_inames) + count_mapper(expr.expr) + return expr + elif isinstance(expr, str) or is_constant(expr): + return expr + else: + raise LoopyError( + "Unexpected expression type: %s" % type(expr).__name__) + +# }}} + + +# {{{ replacing + +class SubexpressionReplacementState(Record): + """ + .. attribute:: codegen_state + + .. attribute:: name_generator + + A callable that can generate new identifiers. + + .. attribute:: term_set_to_inside_inames_list + + A mapping from (summed) sets of subexpressions to a list of tuples of inames + within which the use is nested, one per use. + + .. attribute:: term_subset_to_inside_inames_list + + A mapping from (summed) subsets of subexpressions to their use counts. + + .. attribute:: available_variables + + A set of variables that subexpressions may refer to. + + .. attribute:: term_set_to_variable + + A mapping from term subsets to their replacement variable names. + """ + + +def is_simple(term): + from loopy.symbolic import HardwareAxisIndex + + if p.is_constant(term): + return True + + if (isinstance(term, p.Variable) + or isinstance(term, HardwareAxisIndex)): + return True + + if isinstance(term, p.Product): + n_constants = 0 + n_simple = 0 + n_other = 0 + + for ch in term.children: + if p.is_constant(ch): + n_constants += 1 + elif is_simple(ch): + n_simple += 1 + else: + n_other += 1 + + return n_other == 0 and n_simple <= 1 + + return False + + +def compute_term_subset_to_inside_inames_list( + term_set_to_inside_inames_list, term_set_to_variable): + logger.debug("TERM SET TO SUBSET COUNT:") + for term_set, in_iname_uses in six.iteritems(term_set_to_inside_inames_list): + logger.debug( + "%s: %d" % (" + ".join(str(i) for i in term_set), + len(in_iname_uses))) + + result = {} + for code_term_set, in_iname_uses in six.iteritems( + term_set_to_inside_inames_list): + logger.debug("CTS: " + " + ".join(str(i) for i in code_term_set)) + interacts_with_var_term_sets = [ + var_term_set + for var_term_set in six.iterkeys(term_set_to_variable) + if var_term_set <= code_term_set] + + logger.debug("INTERACTS: " + str(interacts_with_var_term_sets)) + for subset in generate_all_subsets(code_term_set, 1): + if len(subset) == 1: + term, = subset + if is_simple(term): + continue + + will_contribute = True + + for var_term_set in interacts_with_var_term_sets: + if (subset <= var_term_set + or (var_term_set & subset + and not var_term_set < subset)): + will_contribute = False + break + + if will_contribute: + result[subset] = result.get(subset, []) + in_iname_uses + + logger.debug("CTS DONE") + + logger.debug("TERM SUBSET TO COUNT:") + for term_set, in_iname_uses in six.iteritems(result): + logger.debug( + "%s: %d" % (" + ".join(str(i) for i in term_set), + len(in_iname_uses))) + + return result + + +def simplify_terms(terms, term_set_to_variable): + logger.debug("BUILDING EXPR FOR: " + "+".join(str(s) for s in terms)) + did_something = True + while did_something: + did_something = False + + for subset, var_name in sorted( + six.iteritems(term_set_to_variable), + # longest first + key=lambda entry: len(entry[0]), reverse=True): + if subset <= terms: + logger.debug("SIMPLIFYING " + "+".join(str(s) for s in subset) + + "->" + var_name) + terms = ( + (terms - subset) + | frozenset([p.Variable(var_name)])) + did_something = True + break + + logger.debug("GOT " + "+".join(str(s) for s in terms)) + + def term_sort_key(term): + return str(term) + + return sorted(terms, key=term_sort_key) + + +class SubscriptSubsetReplacer(ExprIdentityMapper): + def __init__(self, node_replacer, subex_rep_state): + self.node_replacer = node_replacer + self.subex_rep_state = subex_rep_state + + def _process_subscript(self, expr): + subex_rep_state = self.subex_rep_state + + iname_terms, remainder = get_terms( + subex_rep_state.codegen_state.kernel.all_inames(), + expr=expr.index) + return simplify_terms( + frozenset(iname_terms), + subex_rep_state.term_set_to_variable), remainder + + def map_subscript(self, expr): + iname_terms, remainder = self._process_subscript(expr) + + expr = type(expr)( + expr.aggregate, + p.Sum(tuple(iname_terms) + tuple(remainder))) + + return super(SubscriptSubsetReplacer, self).map_subscript(expr) + + +class ASTSubexpressionReplacer(CASTIdentityMapper): + def map_loopy_scope(self, node, subex_rep_state): + codegen_state = subex_rep_state.codegen_state.copy( + var_subst_map=node.var_subst_map) + + available_variables = ( + subex_rep_state.available_variables + | frozenset(node.available_variables)) + + subex_rep_state = subex_rep_state.copy( + available_variables=available_variables) + + term_set_to_variable = subex_rep_state.term_set_to_variable.copy() + term_subset_to_inside_inames_list = \ + subex_rep_state.term_subset_to_inside_inames_list + + from loopy.symbolic import get_dependencies + + from pytools import argmin2 + from cgen import Block + from loopy.target.c import ScopeASTNode + + initializers = [] + + def is_in_deeper_loop(in_iname_uses): + for iiu in in_iname_uses: + iiu = frozenset(iiu) + + if available_variables & iiu < iiu: # note: not equal! + return True + + return False + + while True: + eligible_subsets = frozenset( + term_set + for term_set, in_iname_uses in six.iteritems( + term_subset_to_inside_inames_list) + if all(get_dependencies(term) <= available_variables + for term in term_set) + if len(in_iname_uses) >= 2 # used more than once + or is_in_deeper_loop(in_iname_uses)) + + if not eligible_subsets: + break + + def get_name_sort_key(subset): + return (sorted(str(term) for term in subset)) + + # find the shortest, most-used subexpression + new_var_subset, _ = argmin2( + ((subset, + (len(subset), + -len(term_subset_to_inside_inames_list[subset]), + get_name_sort_key(subset))) + for subset in eligible_subsets), + return_value=True) + + var_name = subex_rep_state.name_generator("ind") + + old_var_expr = p.Sum(tuple(new_var_subset)) + new_var_expr = p.Sum(tuple( + simplify_terms(new_var_subset, term_set_to_variable))) + + term_set_to_variable[new_var_subset] = var_name + + initializers.append( + codegen_state.ast_builder.emit_initializer( + codegen_state, + codegen_state.kernel.index_dtype, + var_name, + CExpression( + codegen_state, + new_var_expr), + is_const=True, + short_for_expr=old_var_expr)) + + term_subset_to_inside_inames_list = \ + compute_term_subset_to_inside_inames_list( + subex_rep_state.term_set_to_inside_inames_list, + term_set_to_variable) + + # insert initializer code + if initializers: + subnode = node.child + if isinstance(subnode, Block): + subnode = Block(initializers + subnode.contents) + else: + subnode = Block(initializers+[subnode]) + node = ScopeASTNode( + codegen_state, node.available_variables, subnode) + + subex_rep_state = subex_rep_state.copy( + term_set_to_variable=term_set_to_variable, + term_subset_to_inside_inames_list=term_subset_to_inside_inames_list) + + return super(ASTSubexpressionReplacer, self).map_loopy_scope( + node, subex_rep_state) + + def map_expression(self, expr, subex_rep_state): + from pymbolic.primitives import is_constant + if isinstance(expr, CExpression): + ssr = SubscriptSubsetReplacer(self, subex_rep_state) + return CExpression( + expr.codegen_state, + ssr(expr.expr)) + elif isinstance(expr, str) or is_constant(expr): + return expr + else: + raise LoopyError( + "Unexpected expression type: %s" % type(expr).__name__) + +# }}} + + +def eliminate_common_subscripts(codegen_state, node): + if not codegen_state.kernel.options.eliminate_common_subscripts: + return node + + sc = ASTSubexpressionCollector(codegen_state) + sc(node) + + sr = ASTSubexpressionReplacer() + + term_set_to_variable = {} + subex_rep_state = SubexpressionReplacementState( + codegen_state=codegen_state, + name_generator=codegen_state.kernel.get_var_name_generator(), + term_set_to_inside_inames_list=sc.term_set_to_inside_inames_list, + available_variables=codegen_state.kernel.outer_params(), + term_set_to_variable=term_set_to_variable, + term_subset_to_inside_inames_list=( + compute_term_subset_to_inside_inames_list( + sc.term_set_to_inside_inames_list, term_set_to_variable))) + + return sr(node, subex_rep_state) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 2bdffb5aa69bdc0f72fe12a58faa6d0e78920e0f..8def111be247ed2c3fc689098d48fc4540310933 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -29,11 +29,11 @@ import numpy as np from pytools import memoize_method from loopy.target.c import CTarget, CASTBuilder -from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper +from loopy.target.c.codegen.expression import ( + CExpressionToCodeMapper) from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import temp_var_scope -from pymbolic import var # {{{ vector types @@ -141,7 +141,7 @@ def cuda_function_mangler(kernel, name, arg_dtypes): # {{{ expression mapper -class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper): +class CUDACExpressionToCodeMapper(CExpressionToCodeMapper): _GRID_AXES = "xyz" @staticmethod @@ -153,15 +153,15 @@ class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper): else: raise LoopyError("unexpected index type") - def map_group_hw_index(self, expr, type_context): - return var("((%s) blockIdx.%s)" % ( - self._get_index_ctype(self.kernel), - self._GRID_AXES[expr.axis])) + def map_group_hw_index(self, expr, enclosing_prec): + return "((%s) blockIdx.%s)" % ( + self._get_index_ctype(self.codegen_state.kernel), + self._GRID_AXES[expr.axis]) - def map_local_hw_index(self, expr, type_context): - return var("((%s) threadIdx.%s)" % ( - self._get_index_ctype(self.kernel), - self._GRID_AXES[expr.axis])) + def map_local_hw_index(self, expr, enclosing_prec): + return "((%s) threadIdx.%s)" % ( + self._get_index_ctype(self.codegen_state.kernel), + self._GRID_AXES[expr.axis]) # }}} @@ -283,8 +283,8 @@ class CUDACASTBuilder(CASTBuilder): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, codegen_state): - return ExpressionToCudaCExpressionMapper(codegen_state) + def get_c_expression_to_code_mapper(self, codegen_state): + return CUDACExpressionToCodeMapper(codegen_state) _VEC_AXES = "xyzw" diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 80a69bd00c99258b709ea18b2a716c339b888b02..47ae531cbfe7f19eaae2618805fc6411e8bbc0d9 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -27,9 +27,10 @@ THE SOFTWARE. import numpy as np # noqa from loopy.target.c import CTarget, CASTBuilder -from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper +from loopy.target.c.codegen.expression import ( + ExpressionToCExpressionMapper, CExpressionToCodeMapper) from loopy.diagnostic import LoopyError -from loopy.symbolic import Literal +from loopy.symbolic import Literal, LocalHardwareAxisIndex, CombineMapper from pymbolic import var import pymbolic.primitives as p from pymbolic.mapper.stringifier import PREC_NONE @@ -37,28 +38,18 @@ from pymbolic.mapper.stringifier import PREC_NONE from pytools import memoize_method -# {{{ expression mapper +# {{{ expression -> C Expression mapper -class ExprToISPCExprMapper(ExpressionToCExpressionMapper): - def _get_index_ctype(self): - if self.kernel.index_dtype.numpy_dtype == np.int32: - return "int32" - elif self.kernel.index_dtype.numpy_dtype == np.int64: - return "int64" - else: - raise ValueError("unexpected index_type") +def _multiply_terms(expr, factor): + if isinstance(expr, p.Sum): + terms = expr.children + else: + terms = (expr,) - def map_group_hw_index(self, expr, type_context): - return var( - "((uniform %s) taskIndex%d)" - % (self._get_index_ctype(), expr.axis)) + return p.flattened_sum(p.flattened_product((factor, term)) for term in terms) - def map_local_hw_index(self, expr, type_context): - if expr.axis == 0: - return var("(varying %s) programIndex" % self._get_index_ctype()) - else: - raise LoopyError("ISPC only supports one local axis") +class ExprToISPCExprMapper(ExpressionToCExpressionMapper): def map_constant(self, expr, type_context): if isinstance(expr, (complex, np.complexfloating)): raise NotImplementedError("complex numbers in ispc") @@ -88,7 +79,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): # below in decl generation) gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: - return expr[var("programIndex")] + return expr[LocalHardwareAxisIndex(0)] else: return expr @@ -114,7 +105,10 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): subscript, = access_info.subscripts result = var(access_info.array_name)[ - var("programIndex") + self.rec(lsize*subscript, 'i')] + LocalHardwareAxisIndex(0) + + _multiply_terms( + self.rec(subscript, 'i'), + lsize)] if access_info.vector_index is not None: return self.kernel.target.add_vector_access( @@ -128,6 +122,25 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): # }}} +class ISPCExprToCodeMapper(CExpressionToCodeMapper): + def _get_index_ctype(self): + if self.codegen_state.kernel.index_dtype.numpy_dtype == np.int32: + return "int32" + elif self.codegen_state.kernel.index_dtype.numpy_dtype == np.int64: + return "int64" + else: + raise ValueError("unexpected index_type") + + def map_group_hw_index(self, expr, enclosing_prec): + return "((uniform %s) taskIndex%d)" % (self._get_index_ctype(), expr.axis) + + def map_local_hw_index(self, expr, enclosing_prec): + if expr.axis == 0: + return "(varying %s) programIndex" % self._get_index_ctype() + else: + raise LoopyError("ISPC only supports one local axis") + + # {{{ type registry def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): @@ -151,6 +164,30 @@ def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): # }}} +class HasProgramIndexMapper(CombineMapper): + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + return any(values) + + def map_constant(self, expr): + return False + + def map_variable(self, expr): + from loopy.kernel.data import LocalIndexTagBase + return ( + isinstance( + self.kernel.iname_to_tag.get(expr.name, None), + LocalIndexTagBase)) + + def map_group_hw_index(self, expr): + return False + + def map_local_hw_index(self, expr): + return True + + class ISPCTarget(CTarget): """A code generation target for Intel's `ISPC `_ SPMD programming language, to target Intel's Knight's hardware and modern @@ -282,6 +319,9 @@ class ISPCASTBuilder(CASTBuilder): def get_expression_to_c_expression_mapper(self, codegen_state): return ExprToISPCExprMapper(codegen_state) + def get_c_expression_to_code_mapper(self, codegen_state): + return ISPCExprToCodeMapper(codegen_state) + def add_vector_access(self, access_expr, index): return access_expr[index] @@ -475,10 +515,9 @@ class ISPCASTBuilder(CASTBuilder): lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.target.c import POD - + from loopy.target.c import POD, For from pymbolic.mapper.stringifier import PREC_NONE - from cgen import For, InlineInitializer + from cgen import InlineInitializer from cgen.ispc import ISPCUniform @@ -491,8 +530,36 @@ class ISPCASTBuilder(CASTBuilder): PREC_NONE, "i"), "++%s" % iname, inner) + + def emit_initializer(self, codegen_state, dtype, name, val, is_const, + short_for_expr=None): + from cgen.ispc import ISPCUniform, ISPCVarying + from loopy.target.c import POD + + rhs_has_programindex = False + if short_for_expr is not None: + rhs_has_programindex = HasProgramIndexMapper( + codegen_state.kernel)(short_for_expr) + + decl = POD(self, dtype, name) + if rhs_has_programindex: + decl = ISPCVarying(decl) + else: + decl = ISPCUniform(decl) + + from cgen import Initializer, Const + + if is_const: + decl = Const(decl) + + return Initializer(decl, val) + # }}} + def process_ast(self, codegen_state, node): + from loopy.target.c.subscript_cse import eliminate_common_subscripts + return eliminate_common_subscripts(codegen_state, node) + # TODO: Generate launch code # TODO: Vector types (element access: done) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 31cf7c6b648ebf370a17d8beb2538b9748ddb30a..d5c2922aa8445d9593786489cdac5f16fe09b77e 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -27,13 +27,12 @@ THE SOFTWARE. import numpy as np from loopy.target.c import CTarget, CASTBuilder -from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper +from loopy.target.c.codegen.expression import CExpressionToCodeMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper from loopy.kernel.data import temp_var_scope, CallMangleInfo -from pymbolic import var # {{{ dtype registry wrappers @@ -297,12 +296,12 @@ def opencl_preamble_generator(preamble_info): # {{{ expression mapper -class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper): - def map_group_hw_index(self, expr, type_context): - return var("gid")(expr.axis) +class CLExpressionToCodeMapper(CExpressionToCodeMapper): + def map_group_hw_index(self, expr, enclosing_prec): + return "gid(%d)" % expr.axis - def map_local_hw_index(self, expr, type_context): - return var("lid")(expr.axis) + def map_local_hw_index(self, expr, enclosing_prec): + return "lid(%d)" % expr.axis # }}} @@ -364,8 +363,6 @@ class OpenCLTarget(CTarget): # }}} -# }}} - # {{{ ast builder @@ -438,8 +435,8 @@ class OpenCLCASTBuilder(CASTBuilder): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, codegen_state): - return ExpressionToOpenCLCExpressionMapper(codegen_state) + def get_c_expression_to_code_mapper(self, codegen_state): + return CLExpressionToCodeMapper(codegen_state) def add_vector_access(self, access_expr, index): # The 'int' avoids an 'L' suffix for long ints. diff --git a/loopy/target/python.py b/loopy/target/python.py index 09a86665b7d949d7bf35b910cd2a6fd66109c1ec..7212593f870e5ac3874cbf7c69eca959d57deff6 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -269,7 +269,8 @@ class PythonASTBuilderBase(ASTBuilderBase): ), inner) - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const, + short_for_expr=None): from genpy import Assign return Assign(name, val_str) diff --git a/test/test_loopy.py b/test/test_loopy.py index af4269047539b800a5fd389f9293f11551c9a291..e0b2aa94344dee4b65d05dabb78710453624afa6 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1598,18 +1598,26 @@ def test_missing_global_barrier(): def test_index_cse(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m