diff --git a/loopy/__init__.py b/loopy/__init__.py
index 6bd764f8df93f1b4b2ae5755c1c90ccddc654fe6..6cbb3362ef91b27c3b7b1cf6a591f7f9a20c2f7a 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -457,8 +457,8 @@ def _set_up_default_target():
set_default_target(target)
-_set_up_default_target()
+_set_up_default_target()
# }}}
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 8ac963835ec12702f2010806d1d49062422318a2..fd36a5bee2bbe88c3e763410271620640c847778 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -148,8 +148,14 @@ def generate_unroll_loop(codegen_state, sched_index):
for i in range(length):
idx_aff = lower_bound_aff + i
new_codegen_state = codegen_state.fix(iname, idx_aff)
- result.append(
- build_loop_nest(new_codegen_state, sched_index+1))
+ inner = build_loop_nest(new_codegen_state, sched_index+1)
+ inner = inner.with_new_ast(
+ new_codegen_state,
+ codegen_state.ast_builder.emit_scope(
+ new_codegen_state,
+ (iname,),
+ inner.current_ast(new_codegen_state)))
+ result.append(inner)
return merge_codegen_results(codegen_state, result)
@@ -236,17 +242,29 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
from loopy.schedule import get_insn_ids_for_block_at
insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
- if hw_inames_left is None:
- all_inames_by_insns = set()
+ def get_all_inames_by_insns():
+ result = set()
for insn_id in insn_ids_for_block:
- all_inames_by_insns |= kernel.insn_inames(insn_id)
+ result |= kernel.insn_inames(insn_id)
+ return result
+ if hw_inames_left is None:
hw_inames_left = [iname
- for iname in all_inames_by_insns
+ for iname in get_all_inames_by_insns()
if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]
if not hw_inames_left:
- return next_func(codegen_state)
+ hw_inames = tuple(iname
+ for iname in get_all_inames_by_insns()
+ if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag))
+
+ inner = next_func(codegen_state)
+ return inner.with_new_ast(
+ codegen_state,
+ codegen_state.ast_builder.emit_scope(
+ codegen_state,
+ hw_inames,
+ inner.current_ast(codegen_state)))
global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
insn_ids_for_block)
@@ -423,6 +441,12 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
kernel, slab, iname)))
inner = build_loop_nest(new_codegen_state, sched_index+1)
+ inner = inner.with_new_ast(
+ new_codegen_state,
+ codegen_state.ast_builder.emit_scope(
+ new_codegen_state,
+ (loop_iname,),
+ inner.current_ast(new_codegen_state)))
# }}}
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index c683d120dbf6a1205618f8835e8f9c72dd13adf7..df43159f410b69c734b053268b3dc62a2a983840 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -300,7 +300,7 @@ def generate_host_or_device_program(codegen_state, schedule_index):
collapse=False)
cur_prog = codegen_result.current_program(codegen_state)
- body_ast = cur_prog.ast
+ body_ast = ast_builder.process_ast(codegen_state, cur_prog.ast)
fdecl_ast = ast_builder.get_function_declaration(
codegen_state, codegen_result, schedule_index)
@@ -311,8 +311,8 @@ def generate_host_or_device_program(codegen_state, schedule_index):
codegen_result = codegen_result.with_new_program(
codegen_state,
cur_prog.copy(
- ast=ast_builder.process_ast(fdef_ast),
- body_ast=ast_builder.process_ast(body_ast)))
+ ast=fdef_ast,
+ body_ast=body_ast))
return codegen_result
diff --git a/loopy/options.py b/loopy/options.py
index c88c512cb332ceec4587fd3c5011b9f729cad7d5..cfafd443876f8f76de480340e123a8dc5a39b460 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -70,6 +70,8 @@ class Options(Record):
Unless otherwise specified, these options are Boolean-valued
(i.e. on/off).
+ .. ------------------------------------------------------------------------
+
.. rubric:: Code-generation options
.. attribute:: annotate_inames
@@ -95,6 +97,16 @@ class Options(Record):
determining whether an iname duplication is necessary
for the kernel to be schedulable.
+ .. attribute:: eliminate_common_subscripts
+
+ If ``True`` (the default), variable assignments with
+ precomputed subexpressions of array indices will be
+ emitted.
+
+ .. versionadded:: 2016.3
+
+ .. ------------------------------------------------------------------------
+
.. rubric:: Invocation-related options
.. attribute:: skip_arg_checks
@@ -145,6 +157,8 @@ class Options(Record):
A :class:`bool`. Whether to allow colors in terminal output
+ .. ------------------------------------------------------------------------
+
.. rubric:: Features
.. attribute:: disable_global_barriers
@@ -188,6 +202,8 @@ class Options(Record):
trace_assignments=kwargs.get("trace_assignments", False),
trace_assignment_values=kwargs.get("trace_assignment_values", False),
ignore_boostable_into=kwargs.get("ignore_boostable_into", False),
+ eliminate_common_subscripts=kwargs.get("eliminate_common_subscripts",
+ False),
skip_arg_checks=kwargs.get("skip_arg_checks", False),
no_numpy=kwargs.get("no_numpy", False),
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index b1743cb1af927fe8a68d8566ea61ce4e511c5e1a..ce4f949939cd00cff8b25d7df4bec04253ddd0d2 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -179,10 +179,10 @@ class StringifyMapper(StringifyMapperBase):
return "{%s}" % ", ".join(self.rec(ch) for ch in expr.children)
def map_group_hw_index(self, expr, enclosing_prec):
- return "grp.%d" % expr.index
+ return "grp.%d" % expr.axis
def map_local_hw_index(self, expr, enclosing_prec):
- return "loc.%d" % expr.index
+ return "loc.%d" % expr.axis
def map_reduction(self, expr, prec):
return "%sreduce(%s, [%s], %s)" % (
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 5d5743bae322fc59c989cafd85122c8ca619c422..71daa13e78661ff8a38ec94edf278664330a77ef 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -213,7 +213,11 @@ class ASTBuilderBase(object):
def emit_if(self, condition_str, ast):
raise NotImplementedError()
- def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
+ def emit_scope(self, codegen_state, available_variables, ast):
+ return ast
+
+ def emit_initializer(self, codegen_state, dtype, name, val_str, is_const,
+ short_for_expr=None):
raise NotImplementedError()
def emit_declaration_scope(self, codegen_state, inner):
@@ -227,7 +231,7 @@ class ASTBuilderBase(object):
# }}}
- def process_ast(self, node):
+ def process_ast(self, codegen_state, node):
return node
@@ -287,7 +291,8 @@ class DummyHostASTBuilder(ASTBuilderBase):
def emit_if(self, condition_str, ast):
return None
- def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
+ def emit_initializer(self, codegen_state, dtype, name, val_str, is_const,
+ short_for_expr=None):
return None
def emit_blank_line(self):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index be83ec90c4720f10876e1a5e47a43c429fc40aeb..367dcd40b37577945f6d6bb5962f0b84af8f44de 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -1,4 +1,5 @@
-"""Plain C target and base for other C-family languages."""
+"""Target for C-family languages. Usable for outputting C and as a base \
+for other C-family languages."""
from __future__ import division, absolute_import
@@ -33,6 +34,7 @@ from cgen import Pointer, NestedDeclarator, Block
from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
from pymbolic.mapper.stringifier import PREC_NONE
from loopy.symbolic import IdentityMapper
+import cgen
import pymbolic.primitives as p
from pytools import memoize_method
@@ -96,7 +98,7 @@ def _preamble_generator(preamble_info):
# }}}
-# {{{ cgen overrides
+# {{{ cgen syntax tree
from cgen import Declarator
@@ -133,6 +135,44 @@ class POD(Declarator):
mapper_method = "map_loopy_pod"
+class ScopeASTNode(cgen.Generable):
+ def __init__(self, var_subst_map, available_variables, child):
+ self.var_subst_map = var_subst_map
+ self.available_variables = available_variables
+ self.child = child
+
+ def generate(self):
+ for i in self.child.generate():
+ yield i
+
+ mapper_method = "map_loopy_scope"
+
+
+class CgenLoopyLoopMixin(object):
+ def generate(self):
+ if self.intro_line() is not None:
+ yield self.intro_line()
+
+ body = self.body
+ if isinstance(body, ScopeASTNode):
+ body = body.child
+
+ from cgen import Block
+ if isinstance(body, Block):
+ for line in body.generate():
+ yield line
+ else:
+ for line in body.generate():
+ yield " "+line
+
+ if self.outro_line() is not None:
+ yield self.outro_line()
+
+
+class For(CgenLoopyLoopMixin, cgen.For):
+ pass
+
+
class ScopingBlock(Block):
"""A block that is mandatory for scoping and may not be simplified away
by :func:`loopy.codegen.results.merge_codegen_results`.
@@ -243,12 +283,17 @@ class ASTSubscriptCollector(CASTIdentityMapper):
# {{{ lazy expression generation
class CExpression(object):
- def __init__(self, to_code_mapper, expr):
- self.to_code_mapper = to_code_mapper
+ def __init__(self, codegen_state, expr):
+ self.codegen_state = codegen_state
self.expr = expr
def __str__(self):
- return self.to_code_mapper(self.expr, PREC_NONE)
+ to_code_mapper = \
+ self.codegen_state.ast_builder.get_c_expression_to_code_mapper(
+ self.codegen_state)
+ return to_code_mapper(
+ self.expr,
+ PREC_NONE)
# }}}
@@ -535,9 +580,9 @@ class CASTBuilder(ASTBuilderBase):
return ExpressionToCExpressionMapper(
codegen_state, fortran_abi=self.target.fortran_abi)
- def get_c_expression_to_code_mapper(self):
+ def get_c_expression_to_code_mapper(self, codegen_state):
from loopy.target.c.codegen.expression import CExpressionToCodeMapper
- return CExpressionToCodeMapper()
+ return CExpressionToCodeMapper(codegen_state)
def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info):
temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
@@ -707,7 +752,9 @@ class CASTBuilder(ASTBuilderBase):
if len(mangle_result.result_dtypes) == 0:
from cgen import ExpressionStatement
return ExpressionStatement(
- CExpression(self.get_c_expression_to_code_mapper(), result))
+ CExpression(
+ codegen_state,
+ result))
result = ecm.wrap_in_typecast(
mangle_result.result_dtypes[0],
@@ -719,7 +766,9 @@ class CASTBuilder(ASTBuilderBase):
from cgen import Assign
return Assign(
lhs_code,
- CExpression(self.get_c_expression_to_code_mapper(), result))
+ CExpression(
+ codegen_state,
+ result))
def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
lbound, ubound, inner):
@@ -728,7 +777,7 @@ class CASTBuilder(ASTBuilderBase):
from pymbolic import var
from pymbolic.primitives import Comparison
from pymbolic.mapper.stringifier import PREC_NONE
- from cgen import For, InlineInitializer
+ from cgen import InlineInitializer
return For(
InlineInitializer(
@@ -743,7 +792,8 @@ class CASTBuilder(ASTBuilderBase):
"++%s" % iname,
inner)
- def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
+ def emit_initializer(self, codegen_state, dtype, name, val_str, is_const,
+ short_for_expr=None):
decl = POD(self, dtype, name)
from cgen import Initializer, Const
@@ -765,12 +815,14 @@ class CASTBuilder(ASTBuilderBase):
from cgen import If
return If(condition_str, ast)
+ def emit_scope(self, codegen_state, available_variables, ast):
+ return ScopeASTNode(codegen_state.var_subst_map, available_variables, ast)
+
# }}}
- def process_ast(self, node):
- sc = ASTSubscriptCollector()
- sc(node)
- return node
+ def process_ast(self, codegen_state, node):
+ from loopy.target.c.subscript_cse import eliminate_common_subscripts
+ return eliminate_common_subscripts(codegen_state, node=node)
# {{{ header generation
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 68cc32e56be077c7e45d11b9e2aade86b04494cc..e166556f69d09862a4a379aa38e1a1264f634b85 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -112,7 +112,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
assert prec == PREC_NONE
from loopy.target.c import CExpression
return CExpression(
- self.codegen_state.ast_builder.get_c_expression_to_code_mapper(),
+ self.codegen_state,
self.rec(expr, type_context, needed_dtype))
# }}}
@@ -122,15 +122,8 @@ class ExpressionToCExpressionMapper(IdentityMapper):
return x
if expr.name in self.codegen_state.var_subst_map:
- if self.kernel.options.annotate_inames:
- return var(
- "/* %s */ %s" % (
- expr.name,
- self.rec(self.codegen_state.var_subst_map[expr.name],
- type_context)))
- else:
- return self.rec(self.codegen_state.var_subst_map[expr.name],
- type_context)
+ return expr
+
elif expr.name in self.kernel.arg_dict:
arg = self.kernel.arg_dict[expr.name]
from loopy.kernel.array import ArrayBase
@@ -662,10 +655,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
# }}}
def map_group_hw_index(self, expr, type_context):
- raise LoopyError("plain C does not have group hw axes")
+ return expr
def map_local_hw_index(self, expr, type_context):
- raise LoopyError("plain C does not have local hw axes")
+ return expr
# }}}
@@ -673,6 +666,9 @@ class ExpressionToCExpressionMapper(IdentityMapper):
# {{{ C expression to code mapper
class CExpressionToCodeMapper(RecursiveMapper):
+ def __init__(self, codegen_state):
+ self.codegen_state = codegen_state
+
# {{{ helpers
def parenthesize_if_needed(self, s, enclosing_prec, my_prec):
@@ -714,6 +710,19 @@ class CExpressionToCodeMapper(RecursiveMapper):
"entry to loopy")
def map_variable(self, expr, enclosing_prec):
+ if expr.name in self.codegen_state.var_subst_map:
+ if self.codegen_state.kernel.options.annotate_inames:
+ return var(
+ "/* %s */ %s" % (
+ expr.name,
+ self.rec(
+ self.codegen_state.var_subst_map[expr.name],
+ enclosing_prec)))
+ else:
+ return self.rec(
+ self.codegen_state.var_subst_map[expr.name],
+ enclosing_prec)
+
return expr.name
def map_lookup(self, expr, enclosing_prec):
@@ -838,6 +847,12 @@ class CExpressionToCodeMapper(RecursiveMapper):
self.rec(expr.base, PREC_NONE),
self.rec(expr.exponent, PREC_NONE))
+ def map_group_hw_index(self, expr, enclosing_prec):
+ raise LoopyError("plain C does not have group hw axes")
+
+ def map_local_hw_index(self, expr, enclosing_prec):
+ raise LoopyError("plain C does not have local hw axes")
+
def map_array_literal(self, expr, enclosing_prec):
return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE)
diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ffdd864a263cb67e6cf80935edaf064021660b6
--- /dev/null
+++ b/loopy/target/c/subscript_cse.py
@@ -0,0 +1,443 @@
+"""Common subexpression elimination in array subscripts."""
+
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+
+from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
+import pymbolic.primitives as p
+from pytools import Record
+
+from loopy.symbolic import IdentityMapper as ExprIdentityMapper
+from loopy.diagnostic import LoopyError
+from loopy.target.c import CExpression
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+# {{{ utilities
+
+class CASTIdentityMapper(CASTIdentityMapperBase):
+ def map_loopy_scope(self, node, *args, **kwargs):
+ return type(node)(
+ node.var_subst_map,
+ node.available_variables,
+ self.rec(node.child, *args, **kwargs))
+
+ def map_loopy_pod(self, node, *args, **kwargs):
+ return type(node)(node.ast_builder, node.dtype, node.name)
+
+
+def generate_all_subsets(l, min_length):
+ for bits in range(2**len(l)):
+ if bin(bits).count("1") >= min_length:
+ yield frozenset(entry for i, entry in enumerate(l) if (1 << i) & bits)
+
+
+def is_const_product(term):
+ return (
+ p.is_constant(term)
+ or (
+ isinstance(term, p.Product)
+ and all(is_const_product(ch) for ch in term.children)))
+
+
+def get_terms(allowable_vars, expr):
+ if isinstance(expr, p.Sum):
+ terms = expr.children
+ else:
+ terms = (expr,)
+
+ from loopy.symbolic import get_dependencies
+
+ result = []
+ remainder = []
+ for term in terms:
+ deps = get_dependencies(term)
+ if (deps <= allowable_vars
+ and not is_const_product(term)):
+ result.append(term)
+ else:
+ remainder.append(term)
+
+ return result, remainder
+
+# }}}
+
+
+# {{{ counting
+
+class SubscriptSubsetCounter(ExprIdentityMapper):
+ def __init__(self, codegen_state, term_set_to_inside_inames_list,
+ inside_inames):
+ self.codegen_state = codegen_state
+ self.term_set_to_inside_inames_list = term_set_to_inside_inames_list
+ kernel = codegen_state.kernel
+ self.allowable_vars = kernel.all_inames() | kernel.outer_params()
+ self.inside_inames = inside_inames
+
+ def map_subscript(self, expr):
+ terms, _ = get_terms(self.allowable_vars, expr.index)
+ terms = frozenset(terms)
+ self.term_set_to_inside_inames_list[terms] = (
+ self.term_set_to_inside_inames_list.get(terms, [])
+ + [self.inside_inames])
+
+
+class ASTSubexpressionCollector(CASTIdentityMapper):
+ def __init__(self, codegen_state):
+ self.term_set_to_inside_inames_list = {}
+ self.codegen_state = codegen_state
+ self.inside_inames_stack = []
+
+ def map_loopy_scope(self, node):
+ if self.inside_inames_stack:
+ new_inside_inames = self.inside_inames_stack[-1]
+ else:
+ new_inside_inames = ()
+
+ new_inside_inames = (
+ new_inside_inames + node.available_variables)
+
+ self.inside_inames_stack.append(new_inside_inames)
+ result = super(ASTSubexpressionCollector, self).map_loopy_scope(node)
+ self.inside_inames_stack.pop()
+ return result
+
+ def map_expression(self, expr):
+ from pymbolic.primitives import is_constant
+ if isinstance(expr, CExpression):
+ if self.inside_inames_stack:
+ inside_inames = self.inside_inames_stack[-1]
+ else:
+ inside_inames = ()
+ count_mapper = SubscriptSubsetCounter(
+ self.codegen_state,
+ self.term_set_to_inside_inames_list,
+ inside_inames)
+ count_mapper(expr.expr)
+ return expr
+ elif isinstance(expr, str) or is_constant(expr):
+ return expr
+ else:
+ raise LoopyError(
+ "Unexpected expression type: %s" % type(expr).__name__)
+
+# }}}
+
+
+# {{{ replacing
+
+class SubexpressionReplacementState(Record):
+ """
+ .. attribute:: codegen_state
+
+ .. attribute:: name_generator
+
+ A callable that can generate new identifiers.
+
+ .. attribute:: term_set_to_inside_inames_list
+
+ A mapping from (summed) sets of subexpressions to a list of tuples of inames
+ within which the use is nested, one per use.
+
+ .. attribute:: term_subset_to_inside_inames_list
+
+ A mapping from (summed) subsets of subexpressions to their use counts.
+
+ .. attribute:: available_variables
+
+ A set of variables that subexpressions may refer to.
+
+ .. attribute:: term_set_to_variable
+
+ A mapping from term subsets to their replacement variable names.
+ """
+
+
+def is_simple(term):
+ from loopy.symbolic import HardwareAxisIndex
+
+ if p.is_constant(term):
+ return True
+
+ if (isinstance(term, p.Variable)
+ or isinstance(term, HardwareAxisIndex)):
+ return True
+
+ if isinstance(term, p.Product):
+ n_constants = 0
+ n_simple = 0
+ n_other = 0
+
+ for ch in term.children:
+ if p.is_constant(ch):
+ n_constants += 1
+ elif is_simple(ch):
+ n_simple += 1
+ else:
+ n_other += 1
+
+ return n_other == 0 and n_simple <= 1
+
+ return False
+
+
+def compute_term_subset_to_inside_inames_list(
+ term_set_to_inside_inames_list, term_set_to_variable):
+ logger.debug("TERM SET TO SUBSET COUNT:")
+ for term_set, in_iname_uses in six.iteritems(term_set_to_inside_inames_list):
+ logger.debug(
+ "%s: %d" % (" + ".join(str(i) for i in term_set),
+ len(in_iname_uses)))
+
+ result = {}
+ for code_term_set, in_iname_uses in six.iteritems(
+ term_set_to_inside_inames_list):
+ logger.debug("CTS: " + " + ".join(str(i) for i in code_term_set))
+ interacts_with_var_term_sets = [
+ var_term_set
+ for var_term_set in six.iterkeys(term_set_to_variable)
+ if var_term_set <= code_term_set]
+
+ logger.debug("INTERACTS: " + str(interacts_with_var_term_sets))
+ for subset in generate_all_subsets(code_term_set, 1):
+ if len(subset) == 1:
+ term, = subset
+ if is_simple(term):
+ continue
+
+ will_contribute = True
+
+ for var_term_set in interacts_with_var_term_sets:
+ if (subset <= var_term_set
+ or (var_term_set & subset
+ and not var_term_set < subset)):
+ will_contribute = False
+ break
+
+ if will_contribute:
+ result[subset] = result.get(subset, []) + in_iname_uses
+
+ logger.debug("CTS DONE")
+
+ logger.debug("TERM SUBSET TO COUNT:")
+ for term_set, in_iname_uses in six.iteritems(result):
+ logger.debug(
+ "%s: %d" % (" + ".join(str(i) for i in term_set),
+ len(in_iname_uses)))
+
+ return result
+
+
+def simplify_terms(terms, term_set_to_variable):
+ logger.debug("BUILDING EXPR FOR: " + "+".join(str(s) for s in terms))
+ did_something = True
+ while did_something:
+ did_something = False
+
+ for subset, var_name in sorted(
+ six.iteritems(term_set_to_variable),
+ # longest first
+ key=lambda entry: len(entry[0]), reverse=True):
+ if subset <= terms:
+ logger.debug("SIMPLIFYING " + "+".join(str(s) for s in subset)
+ + "->" + var_name)
+ terms = (
+ (terms - subset)
+ | frozenset([p.Variable(var_name)]))
+ did_something = True
+ break
+
+ logger.debug("GOT " + "+".join(str(s) for s in terms))
+
+ def term_sort_key(term):
+ return str(term)
+
+ return sorted(terms, key=term_sort_key)
+
+
+class SubscriptSubsetReplacer(ExprIdentityMapper):
+ def __init__(self, node_replacer, subex_rep_state):
+ self.node_replacer = node_replacer
+ self.subex_rep_state = subex_rep_state
+
+ def _process_subscript(self, expr):
+ subex_rep_state = self.subex_rep_state
+
+ iname_terms, remainder = get_terms(
+ subex_rep_state.codegen_state.kernel.all_inames(),
+ expr=expr.index)
+ return simplify_terms(
+ frozenset(iname_terms),
+ subex_rep_state.term_set_to_variable), remainder
+
+ def map_subscript(self, expr):
+ iname_terms, remainder = self._process_subscript(expr)
+
+ expr = type(expr)(
+ expr.aggregate,
+ p.Sum(tuple(iname_terms) + tuple(remainder)))
+
+ return super(SubscriptSubsetReplacer, self).map_subscript(expr)
+
+
+class ASTSubexpressionReplacer(CASTIdentityMapper):
+ def map_loopy_scope(self, node, subex_rep_state):
+ codegen_state = subex_rep_state.codegen_state.copy(
+ var_subst_map=node.var_subst_map)
+
+ available_variables = (
+ subex_rep_state.available_variables
+ | frozenset(node.available_variables))
+
+ subex_rep_state = subex_rep_state.copy(
+ available_variables=available_variables)
+
+ term_set_to_variable = subex_rep_state.term_set_to_variable.copy()
+ term_subset_to_inside_inames_list = \
+ subex_rep_state.term_subset_to_inside_inames_list
+
+ from loopy.symbolic import get_dependencies
+
+ from pytools import argmin2
+ from cgen import Block
+ from loopy.target.c import ScopeASTNode
+
+ initializers = []
+
+ def is_in_deeper_loop(in_iname_uses):
+ for iiu in in_iname_uses:
+ iiu = frozenset(iiu)
+
+ if available_variables & iiu < iiu: # note: not equal!
+ return True
+
+ return False
+
+ while True:
+ eligible_subsets = frozenset(
+ term_set
+ for term_set, in_iname_uses in six.iteritems(
+ term_subset_to_inside_inames_list)
+ if all(get_dependencies(term) <= available_variables
+ for term in term_set)
+ if len(in_iname_uses) >= 2 # used more than once
+ or is_in_deeper_loop(in_iname_uses))
+
+ if not eligible_subsets:
+ break
+
+ def get_name_sort_key(subset):
+ return (sorted(str(term) for term in subset))
+
+ # find the shortest, most-used subexpression
+ new_var_subset, _ = argmin2(
+ ((subset,
+ (len(subset),
+ -len(term_subset_to_inside_inames_list[subset]),
+ get_name_sort_key(subset)))
+ for subset in eligible_subsets),
+ return_value=True)
+
+ var_name = subex_rep_state.name_generator("ind")
+
+ old_var_expr = p.Sum(tuple(new_var_subset))
+ new_var_expr = p.Sum(tuple(
+ simplify_terms(new_var_subset, term_set_to_variable)))
+
+ term_set_to_variable[new_var_subset] = var_name
+
+ initializers.append(
+ codegen_state.ast_builder.emit_initializer(
+ codegen_state,
+ codegen_state.kernel.index_dtype,
+ var_name,
+ CExpression(
+ codegen_state,
+ new_var_expr),
+ is_const=True,
+ short_for_expr=old_var_expr))
+
+ term_subset_to_inside_inames_list = \
+ compute_term_subset_to_inside_inames_list(
+ subex_rep_state.term_set_to_inside_inames_list,
+ term_set_to_variable)
+
+ # insert initializer code
+ if initializers:
+ subnode = node.child
+ if isinstance(subnode, Block):
+ subnode = Block(initializers + subnode.contents)
+ else:
+ subnode = Block(initializers+[subnode])
+ node = ScopeASTNode(
+ codegen_state, node.available_variables, subnode)
+
+ subex_rep_state = subex_rep_state.copy(
+ term_set_to_variable=term_set_to_variable,
+ term_subset_to_inside_inames_list=term_subset_to_inside_inames_list)
+
+ return super(ASTSubexpressionReplacer, self).map_loopy_scope(
+ node, subex_rep_state)
+
+ def map_expression(self, expr, subex_rep_state):
+ from pymbolic.primitives import is_constant
+ if isinstance(expr, CExpression):
+ ssr = SubscriptSubsetReplacer(self, subex_rep_state)
+ return CExpression(
+ expr.codegen_state,
+ ssr(expr.expr))
+ elif isinstance(expr, str) or is_constant(expr):
+ return expr
+ else:
+ raise LoopyError(
+ "Unexpected expression type: %s" % type(expr).__name__)
+
+# }}}
+
+
+def eliminate_common_subscripts(codegen_state, node):
+ if not codegen_state.kernel.options.eliminate_common_subscripts:
+ return node
+
+ sc = ASTSubexpressionCollector(codegen_state)
+ sc(node)
+
+ sr = ASTSubexpressionReplacer()
+
+ term_set_to_variable = {}
+ subex_rep_state = SubexpressionReplacementState(
+ codegen_state=codegen_state,
+ name_generator=codegen_state.kernel.get_var_name_generator(),
+ term_set_to_inside_inames_list=sc.term_set_to_inside_inames_list,
+ available_variables=codegen_state.kernel.outer_params(),
+ term_set_to_variable=term_set_to_variable,
+ term_subset_to_inside_inames_list=(
+ compute_term_subset_to_inside_inames_list(
+ sc.term_set_to_inside_inames_list, term_set_to_variable)))
+
+ return sr(node, subex_rep_state)
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 2bdffb5aa69bdc0f72fe12a58faa6d0e78920e0f..8def111be247ed2c3fc689098d48fc4540310933 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -29,11 +29,11 @@ import numpy as np
from pytools import memoize_method
from loopy.target.c import CTarget, CASTBuilder
-from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
+from loopy.target.c.codegen.expression import (
+ CExpressionToCodeMapper)
from loopy.diagnostic import LoopyError
from loopy.types import NumpyType
from loopy.kernel.data import temp_var_scope
-from pymbolic import var
# {{{ vector types
@@ -141,7 +141,7 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
# {{{ expression mapper
-class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper):
+class CUDACExpressionToCodeMapper(CExpressionToCodeMapper):
_GRID_AXES = "xyz"
@staticmethod
@@ -153,15 +153,15 @@ class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper):
else:
raise LoopyError("unexpected index type")
- def map_group_hw_index(self, expr, type_context):
- return var("((%s) blockIdx.%s)" % (
- self._get_index_ctype(self.kernel),
- self._GRID_AXES[expr.axis]))
+ def map_group_hw_index(self, expr, enclosing_prec):
+ return "((%s) blockIdx.%s)" % (
+ self._get_index_ctype(self.codegen_state.kernel),
+ self._GRID_AXES[expr.axis])
- def map_local_hw_index(self, expr, type_context):
- return var("((%s) threadIdx.%s)" % (
- self._get_index_ctype(self.kernel),
- self._GRID_AXES[expr.axis]))
+ def map_local_hw_index(self, expr, enclosing_prec):
+ return "((%s) threadIdx.%s)" % (
+ self._get_index_ctype(self.codegen_state.kernel),
+ self._GRID_AXES[expr.axis])
# }}}
@@ -283,8 +283,8 @@ class CUDACASTBuilder(CASTBuilder):
# {{{ code generation guts
- def get_expression_to_c_expression_mapper(self, codegen_state):
- return ExpressionToCudaCExpressionMapper(codegen_state)
+ def get_c_expression_to_code_mapper(self, codegen_state):
+ return CUDACExpressionToCodeMapper(codegen_state)
_VEC_AXES = "xyzw"
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 80a69bd00c99258b709ea18b2a716c339b888b02..47ae531cbfe7f19eaae2618805fc6411e8bbc0d9 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -27,9 +27,10 @@ THE SOFTWARE.
import numpy as np # noqa
from loopy.target.c import CTarget, CASTBuilder
-from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
+from loopy.target.c.codegen.expression import (
+ ExpressionToCExpressionMapper, CExpressionToCodeMapper)
from loopy.diagnostic import LoopyError
-from loopy.symbolic import Literal
+from loopy.symbolic import Literal, LocalHardwareAxisIndex, CombineMapper
from pymbolic import var
import pymbolic.primitives as p
from pymbolic.mapper.stringifier import PREC_NONE
@@ -37,28 +38,18 @@ from pymbolic.mapper.stringifier import PREC_NONE
from pytools import memoize_method
-# {{{ expression mapper
+# {{{ expression -> C Expression mapper
-class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
- def _get_index_ctype(self):
- if self.kernel.index_dtype.numpy_dtype == np.int32:
- return "int32"
- elif self.kernel.index_dtype.numpy_dtype == np.int64:
- return "int64"
- else:
- raise ValueError("unexpected index_type")
+def _multiply_terms(expr, factor):
+ if isinstance(expr, p.Sum):
+ terms = expr.children
+ else:
+ terms = (expr,)
- def map_group_hw_index(self, expr, type_context):
- return var(
- "((uniform %s) taskIndex%d)"
- % (self._get_index_ctype(), expr.axis))
+ return p.flattened_sum(p.flattened_product((factor, term)) for term in terms)
- def map_local_hw_index(self, expr, type_context):
- if expr.axis == 0:
- return var("(varying %s) programIndex" % self._get_index_ctype())
- else:
- raise LoopyError("ISPC only supports one local axis")
+class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
def map_constant(self, expr, type_context):
if isinstance(expr, (complex, np.complexfloating)):
raise NotImplementedError("complex numbers in ispc")
@@ -88,7 +79,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
# below in decl generation)
gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
if lsize:
- return expr[var("programIndex")]
+ return expr[LocalHardwareAxisIndex(0)]
else:
return expr
@@ -114,7 +105,10 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
subscript, = access_info.subscripts
result = var(access_info.array_name)[
- var("programIndex") + self.rec(lsize*subscript, 'i')]
+ LocalHardwareAxisIndex(0)
+ + _multiply_terms(
+ self.rec(subscript, 'i'),
+ lsize)]
if access_info.vector_index is not None:
return self.kernel.target.add_vector_access(
@@ -128,6 +122,25 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
# }}}
+class ISPCExprToCodeMapper(CExpressionToCodeMapper):
+ def _get_index_ctype(self):
+ if self.codegen_state.kernel.index_dtype.numpy_dtype == np.int32:
+ return "int32"
+ elif self.codegen_state.kernel.index_dtype.numpy_dtype == np.int64:
+ return "int64"
+ else:
+ raise ValueError("unexpected index_type")
+
+ def map_group_hw_index(self, expr, enclosing_prec):
+ return "((uniform %s) taskIndex%d)" % (self._get_index_ctype(), expr.axis)
+
+ def map_local_hw_index(self, expr, enclosing_prec):
+ if expr.axis == 0:
+ return "(varying %s) programIndex" % self._get_index_ctype()
+ else:
+ raise LoopyError("ISPC only supports one local axis")
+
+
# {{{ type registry
def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True):
@@ -151,6 +164,30 @@ def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True):
# }}}
+class HasProgramIndexMapper(CombineMapper):
+ def __init__(self, kernel):
+ self.kernel = kernel
+
+ def combine(self, values):
+ return any(values)
+
+ def map_constant(self, expr):
+ return False
+
+ def map_variable(self, expr):
+ from loopy.kernel.data import LocalIndexTagBase
+ return (
+ isinstance(
+ self.kernel.iname_to_tag.get(expr.name, None),
+ LocalIndexTagBase))
+
+ def map_group_hw_index(self, expr):
+ return False
+
+ def map_local_hw_index(self, expr):
+ return True
+
+
class ISPCTarget(CTarget):
"""A code generation target for Intel's `ISPC `_
SPMD programming language, to target Intel's Knight's hardware and modern
@@ -282,6 +319,9 @@ class ISPCASTBuilder(CASTBuilder):
def get_expression_to_c_expression_mapper(self, codegen_state):
return ExprToISPCExprMapper(codegen_state)
+ def get_c_expression_to_code_mapper(self, codegen_state):
+ return ISPCExprToCodeMapper(codegen_state)
+
def add_vector_access(self, access_expr, index):
return access_expr[index]
@@ -475,10 +515,9 @@ class ISPCASTBuilder(CASTBuilder):
lbound, ubound, inner):
ecm = codegen_state.expression_to_code_mapper
- from loopy.target.c import POD
-
+ from loopy.target.c import POD, For
from pymbolic.mapper.stringifier import PREC_NONE
- from cgen import For, InlineInitializer
+ from cgen import InlineInitializer
from cgen.ispc import ISPCUniform
@@ -491,8 +530,36 @@ class ISPCASTBuilder(CASTBuilder):
PREC_NONE, "i"),
"++%s" % iname,
inner)
+
+ def emit_initializer(self, codegen_state, dtype, name, val, is_const,
+ short_for_expr=None):
+ from cgen.ispc import ISPCUniform, ISPCVarying
+ from loopy.target.c import POD
+
+ rhs_has_programindex = False
+ if short_for_expr is not None:
+ rhs_has_programindex = HasProgramIndexMapper(
+ codegen_state.kernel)(short_for_expr)
+
+ decl = POD(self, dtype, name)
+ if rhs_has_programindex:
+ decl = ISPCVarying(decl)
+ else:
+ decl = ISPCUniform(decl)
+
+ from cgen import Initializer, Const
+
+ if is_const:
+ decl = Const(decl)
+
+ return Initializer(decl, val)
+
# }}}
+ def process_ast(self, codegen_state, node):
+ from loopy.target.c.subscript_cse import eliminate_common_subscripts
+ return eliminate_common_subscripts(codegen_state, node)
+
# TODO: Generate launch code
# TODO: Vector types (element access: done)
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 31cf7c6b648ebf370a17d8beb2538b9748ddb30a..d5c2922aa8445d9593786489cdac5f16fe09b77e 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -27,13 +27,12 @@ THE SOFTWARE.
import numpy as np
from loopy.target.c import CTarget, CASTBuilder
-from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
+from loopy.target.c.codegen.expression import CExpressionToCodeMapper
from pytools import memoize_method
from loopy.diagnostic import LoopyError
from loopy.types import NumpyType
from loopy.target.c import DTypeRegistryWrapper
from loopy.kernel.data import temp_var_scope, CallMangleInfo
-from pymbolic import var
# {{{ dtype registry wrappers
@@ -297,12 +296,12 @@ def opencl_preamble_generator(preamble_info):
# {{{ expression mapper
-class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
- def map_group_hw_index(self, expr, type_context):
- return var("gid")(expr.axis)
+class CLExpressionToCodeMapper(CExpressionToCodeMapper):
+ def map_group_hw_index(self, expr, enclosing_prec):
+ return "gid(%d)" % expr.axis
- def map_local_hw_index(self, expr, type_context):
- return var("lid")(expr.axis)
+ def map_local_hw_index(self, expr, enclosing_prec):
+ return "lid(%d)" % expr.axis
# }}}
@@ -364,8 +363,6 @@ class OpenCLTarget(CTarget):
# }}}
-# }}}
-
# {{{ ast builder
@@ -438,8 +435,8 @@ class OpenCLCASTBuilder(CASTBuilder):
# {{{ code generation guts
- def get_expression_to_c_expression_mapper(self, codegen_state):
- return ExpressionToOpenCLCExpressionMapper(codegen_state)
+ def get_c_expression_to_code_mapper(self, codegen_state):
+ return CLExpressionToCodeMapper(codegen_state)
def add_vector_access(self, access_expr, index):
# The 'int' avoids an 'L' suffix for long ints.
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 09a86665b7d949d7bf35b910cd2a6fd66109c1ec..7212593f870e5ac3874cbf7c69eca959d57deff6 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -269,7 +269,8 @@ class PythonASTBuilderBase(ASTBuilderBase):
),
inner)
- def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
+ def emit_initializer(self, codegen_state, dtype, name, val_str, is_const,
+ short_for_expr=None):
from genpy import Assign
return Assign(name, val_str)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index af4269047539b800a5fd389f9293f11551c9a291..e0b2aa94344dee4b65d05dabb78710453624afa6 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1598,18 +1598,26 @@ def test_missing_global_barrier():
def test_index_cse(ctx_factory):
+ ctx = ctx_factory()
+
knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m