From 58d2ee99cc8c9011562ecfa5de125e965d551ecf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 24 Oct 2016 23:59:40 -0500 Subject: [PATCH 01/17] Implement index CSE --- loopy/codegen/loop.py | 24 ++- loopy/codegen/result.py | 6 +- loopy/options.py | 16 ++ loopy/target/__init__.py | 5 +- loopy/target/c/__init__.py | 57 ++---- loopy/target/c/compyte | 2 +- loopy/target/c/subscript_cse.py | 333 ++++++++++++++++++++++++++++++++ loopy/target/ispc.py | 14 ++ loopy/target/pyopencl.py | 4 +- loopy/version.py | 2 +- test/test_loopy.py | 10 + 11 files changed, 426 insertions(+), 47 deletions(-) create mode 100644 loopy/target/c/subscript_cse.py diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 648c3fe6f..012f1a8f9 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -148,8 +148,13 @@ def generate_unroll_loop(codegen_state, sched_index): for i in range(length): idx_aff = lower_bound_aff + i new_codegen_state = codegen_state.fix(iname, idx_aff) - result.append( - build_loop_nest(new_codegen_state, sched_index+1)) + inner = build_loop_nest(new_codegen_state, sched_index+1) + inner = inner.with_new_ast( + codegen_state, + codegen_state.ast_builder.emit_scope( + (iname,), + inner.current_ast(codegen_state))) + result.append(inner) return merge_codegen_results(codegen_state, result) @@ -246,7 +251,15 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] if not hw_inames_left: - return next_func(codegen_state) + hw_inames = tuple(iname + for iname in all_inames_by_insns + if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)) + + inner = next_func(codegen_state) + return inner.with_new_ast( + codegen_state, + codegen_state.ast_builder.emit_scope(hw_inames, + inner.current_ast(codegen_state))) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( insn_ids_for_block) @@ -432,6 +445,11 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): kernel, slab, iname))) inner = build_loop_nest(new_codegen_state, sched_index+1) + inner = inner.with_new_ast( + codegen_state, + codegen_state.ast_builder.emit_scope( + (loop_iname,), + inner.current_ast(codegen_state))) # }}} diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 04fab05af..39960fdcd 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -298,7 +298,7 @@ def generate_host_or_device_program(codegen_state, schedule_index): collapse=False) cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast + body_ast = ast_builder.process_ast(codegen_state, cur_prog.ast) fdecl_ast = ast_builder.get_function_declaration( codegen_state, codegen_result, schedule_index) @@ -309,8 +309,8 @@ def generate_host_or_device_program(codegen_state, schedule_index): codegen_result = codegen_result.with_new_program( codegen_state, cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) + ast=fdef_ast, + body_ast=body_ast)) return codegen_result diff --git a/loopy/options.py b/loopy/options.py index 5db1be646..79186253d 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -37,6 +37,8 @@ class Options(Record): Unless otherwise specified, these options are Boolean-valued (i.e. on/off). + .. ------------------------------------------------------------------------ + .. rubric:: Code-generation options .. attribute:: annotate_inames @@ -62,6 +64,16 @@ class Options(Record): determining whether an iname duplication is necessary for the kernel to be schedulable. + .. attribute:: eliminate_common_subscripts + + If ``True`` (the default), variable assignments with + precomputed subexpressions of array indices will be + emitted. + + .. versionadded:: 2016.3 + + .. ------------------------------------------------------------------------ + .. rubric:: Invocation-related options .. attribute:: skip_arg_checks @@ -121,6 +133,8 @@ class Options(Record): A :class:`bool`. Whether to allow colors in terminal output + .. ------------------------------------------------------------------------ + .. rubric:: Features .. attribute:: disable_global_barriers @@ -139,6 +153,7 @@ class Options(Record): trace_assignments=False, trace_assignment_values=False, ignore_boostable_into=False, + eliminate_common_subscripts=True, skip_arg_checks=False, no_numpy=False, return_dict=False, write_wrapper=False, highlight_wrapper=False, @@ -163,6 +178,7 @@ class Options(Record): trace_assignments=trace_assignments, trace_assignment_values=trace_assignment_values, ignore_boostable_into=ignore_boostable_into, + eliminate_common_subscripts=eliminate_common_subscripts, skip_arg_checks=skip_arg_checks, no_numpy=no_numpy, return_dict=return_dict, diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 409b9badb..aff1c7310 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -213,6 +213,9 @@ class ASTBuilderBase(object): def emit_if(self, condition_str, ast): raise NotImplementedError() + def emit_scope(self, available_variables, ast): + return ast + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): raise NotImplementedError() @@ -224,7 +227,7 @@ class ASTBuilderBase(object): # }}} - def process_ast(self, node): + def process_ast(self, kernel, node): return node diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index be1a81bcd..d7c7c92da 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1,4 +1,5 @@ -"""OpenCL target independent of PyOpenCL.""" +"""Target for C-family languages. Usable for outputting C and as a base \ +for other C-family languages.""" from __future__ import division, absolute_import @@ -30,13 +31,24 @@ import numpy as np # noqa from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError from cgen import Pointer -from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE -from loopy.symbolic import IdentityMapper +import cgen from pytools import memoize_method +class ScopeASTNode(cgen.Generable): + def __init__(self, available_variables, child): + self.available_variables = available_variables + self.child = child + + def generate(self): + for i in self.child.generate(): + yield i + + mapper_method = "map_loopy_scope" + + # {{{ dtype registry wrapper class DTypeRegistryWrapper(object): @@ -192,35 +204,6 @@ def generate_array_literal(codegen_state, array, value): # }}} -# {{{ subscript CSE - -class CASTIdentityMapper(CASTIdentityMapperBase): - def map_loopy_pod(self, node, *args, **kwargs): - return type(node)(node.ast_builder, node.dtype, node.name) - - -class SubscriptSubsetCounter(IdentityMapper): - def __init__(self, subset_counters): - self.subset_counters = subset_counters - - -class ASTSubscriptCollector(CASTIdentityMapper): - def __init__(self): - self.subset_counters = {} - - def map_expression(self, expr): - from pymbolic.primitives import is_constant - if isinstance(expr, CExpression) or is_constant(expr): - return expr - elif isinstance(expr, str): - return expr - else: - raise LoopyError( - "Unexpected expression type: %s" % type(expr).__name__) - -# }}} - - # {{{ lazy expression generation class CExpression(object): @@ -720,12 +703,14 @@ class CASTBuilder(ASTBuilderBase): from cgen import If return If(condition_str, ast) + def emit_scope(self, available_variables, ast): + return ScopeASTNode(available_variables, ast) + # }}} - def process_ast(self, node): - sc = ASTSubscriptCollector() - sc(node) - return node + def process_ast(self, codegen_state, node): + from loopy.target.c.subscript_cse import eliminate_common_subscripts + return eliminate_common_subscripts(codegen_state, node) # vim: foldmethod=marker diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 11dc00352..ac1c71d46 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 11dc00352423cddd71f09e809d0a22ab1c3ea7a5 +Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py new file mode 100644 index 000000000..eb0a3400a --- /dev/null +++ b/loopy/target/c/subscript_cse.py @@ -0,0 +1,333 @@ +"""Common subexpression elimination in array subscripts.""" + +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six + +from cgen.mapper import IdentityMapper as CASTIdentityMapperBase +import pymbolic.primitives as p +from pytools import Record + +from loopy.symbolic import IdentityMapper as ExprIdentityMapper +from loopy.diagnostic import LoopyError +from loopy.target.c import CExpression + +import logging +logger = logging.getLogger(__name__) + + +# {{{ utilities + +class CASTIdentityMapper(CASTIdentityMapperBase): + def map_loopy_scope(self, node, *args, **kwargs): + return type(node)( + node.available_variables, + self.rec(node.child, *args, **kwargs)) + + def map_loopy_pod(self, node, *args, **kwargs): + return type(node)(node.ast_builder, node.dtype, node.name) + + +def generate_all_subsets(l, min_length): + for bits in range(2**len(l)): + if bin(bits).count("1") >= min_length: + yield frozenset(entry for i, entry in enumerate(l) if (1 << i) & bits) + + +def get_terms(allowable_vars, expr): + if isinstance(expr, p.Sum): + terms = expr.children + else: + terms = (expr,) + + from loopy.symbolic import get_dependencies + from pymbolic.primitives import is_constant + + result = [] + remainder = [] + for term in terms: + if get_dependencies(term) <= allowable_vars and not is_constant(term): + result.append(term) + elif remainder is not None: + remainder.append(term) + + return result, remainder + +# }}} + + +# {{{ counting + +class SubscriptSubsetCounter(ExprIdentityMapper): + def __init__(self, kernel, term_set_to_count): + self.kernel = kernel + self.term_set_to_count = term_set_to_count + + def map_subscript(self, expr): + iname_terms, _ = get_terms(self.kernel.all_inames(), expr.index) + iname_terms = frozenset(iname_terms) + self.term_set_to_count[iname_terms] = \ + self.term_set_to_count.get(iname_terms, 0) + 1 + + +class ASTSubexpressionCollector(CASTIdentityMapper): + def __init__(self, kernel): + self.term_set_to_count = {} + self.subset_count_mapper = SubscriptSubsetCounter( + kernel, self.term_set_to_count) + + def map_expression(self, expr): + from pymbolic.primitives import is_constant + if isinstance(expr, CExpression): + self.subset_count_mapper(expr.expr) + return expr + elif isinstance(expr, str) or is_constant(expr): + return expr + else: + raise LoopyError( + "Unexpected expression type: %s" % type(expr).__name__) + +# }}} + + +# {{{ replacing + +class SubexpressionReplacementState(Record): + """ + .. attribute:: codegen_state + + .. attribute:: name_generator + + A callable that can generate new identifiers. + + .. attribute:: term_set_to_count + + A mapping from (summed) sets of subexpressions to their use counts. + + .. attribute:: term_subset_to_count + + A mapping from (summed) subsets of subexpressions to their use counts. + + .. attribute:: available_variables + + A set of variables that subexpressions may refer to. + + .. attribute:: term_set_to_variable + + A mapping from term subsets to their replacement variable names. + """ + + +def compute_term_subset_to_count(term_set_to_count, term_set_to_variable): + logger.debug("TERM SET TO SUBSET COUNT:") + for term_set, count in six.iteritems(term_set_to_count): + logger.debug(" + ".join(str(i) for i in term_set), count) + + result = {} + for code_term_set, cnt in six.iteritems(term_set_to_count): + logger.debug("CTS:", " + ".join(str(i) for i in code_term_set)) + interacts_with_var_term_sets = [ + var_term_set + for var_term_set in six.iterkeys(term_set_to_variable) + if var_term_set <= code_term_set] + + logger.debug("INTERACTS:", interacts_with_var_term_sets) + for subset in generate_all_subsets(code_term_set, 2): + will_contribute = True + + for var_term_set in interacts_with_var_term_sets: + if (subset <= var_term_set + or (var_term_set & subset + and not var_term_set < subset)): + will_contribute = False + break + + if will_contribute: + result[subset] = result.get(subset, 0) + cnt + + logger.debug("CTS DONE") + + logger.debug("TERM SUBSET TO COUNT:") + for term_set, count in six.iteritems(result): + logger.debug(" + ".join(str(i) for i in term_set), count) + + return result + + +def simplify_terms(terms, term_set_to_variable): + logger.debug("BUILDING EXPR FOR:", "+".join(str(s) for s in terms)) + did_something = True + while did_something: + did_something = False + + for subset, var_name in sorted( + six.iteritems(term_set_to_variable), + # longest first + key=lambda entry: len(entry[0]), reverse=True): + if subset <= terms: + logger.debug("SIMPLIFYING", "+".join(str(s) for s in subset), + "->", var_name) + terms = ( + (terms - subset) + | frozenset([p.Variable(var_name)])) + did_something = True + break + + logger.debug("GOT", "+".join(str(s) for s in terms)) + return terms + + +class SubscriptSubsetReplacer(ExprIdentityMapper): + def __init__(self, node_replacer, subex_rep_state): + self.node_replacer = node_replacer + self.subex_rep_state = subex_rep_state + + def map_subscript(self, expr): + subex_rep_state = self.subex_rep_state + + iname_terms, remainder = get_terms( + subex_rep_state.codegen_state.kernel.all_inames(), + expr.index) + iname_terms = simplify_terms( + frozenset(iname_terms), + subex_rep_state.term_set_to_variable) + + expr = type(expr)( + expr.aggregate, + p.Sum(tuple(iname_terms) + tuple(remainder))) + + return super(SubscriptSubsetReplacer, self).map_subscript(expr) + + +class ASTSubexpressionReplacer(CASTIdentityMapper): + def map_loopy_scope(self, node, subex_rep_state): + codegen_state = subex_rep_state.codegen_state + + available_variables = ( + subex_rep_state.available_variables + | frozenset(node.available_variables)) + subex_rep_state = subex_rep_state.copy( + available_variables=available_variables) + + term_set_to_variable = subex_rep_state.term_set_to_variable.copy() + term_subset_to_count = subex_rep_state.term_subset_to_count + + from loopy.symbolic import get_dependencies + from pytools import argmin2 + from cgen import Block + from loopy.target.c import ScopeASTNode + + initializers = [] + + while True: + eligible_subsets = frozenset( + term_set + for term_set, count in six.iteritems(term_subset_to_count) + if all(get_dependencies(term) <= available_variables + for term in term_set) + if count >= 2) + + if not eligible_subsets: + break + + # find the shortest, most-used subexpression + new_var_subset, _ = argmin2( + ((subset, + (len(subset), -term_subset_to_count[subset])) + for subset in eligible_subsets), + return_value=True) + + var_name = subex_rep_state.name_generator("index_subexp") + + new_var_expr = p.Sum(tuple( + simplify_terms(new_var_subset, term_set_to_variable))) + + term_set_to_variable[new_var_subset] = var_name + + initializers.append( + codegen_state.ast_builder.emit_initializer( + codegen_state, + codegen_state.kernel.index_dtype, + var_name, + CExpression( + subex_rep_state.codegen_state.ast_builder + .get_c_expression_to_code_mapper(), + new_var_expr), + is_const=True)) + + term_subset_to_count = compute_term_subset_to_count( + subex_rep_state.term_set_to_count, + term_set_to_variable) + + # insert initializer code + if initializers: + subnode = node.child + if isinstance(subnode, Block): + subnode = Block(initializers + subnode.contents) + else: + subnode = Block(initializers+[subnode]) + node = ScopeASTNode(node.available_variables, subnode) + + subex_rep_state = subex_rep_state.copy( + term_set_to_variable=term_set_to_variable, + term_subset_to_count=term_subset_to_count) + + return super(ASTSubexpressionReplacer, self).map_loopy_scope( + node, subex_rep_state) + + def map_expression(self, expr, subex_rep_state): + from pymbolic.primitives import is_constant + if isinstance(expr, CExpression): + ssr = SubscriptSubsetReplacer(self, subex_rep_state) + return ssr(expr.expr) + elif isinstance(expr, str) or is_constant(expr): + return expr + else: + raise LoopyError( + "Unexpected expression type: %s" % type(expr).__name__) + +# }}} + + +def eliminate_common_subscripts(codegen_state, node): + if not codegen_state.kernel.options.eliminate_common_subscripts: + return node + + sc = ASTSubexpressionCollector(codegen_state.kernel) + sc(node) + + sr = ASTSubexpressionReplacer() + + term_set_to_variable = {} + subex_rep_state = SubexpressionReplacementState( + codegen_state=codegen_state, + name_generator=codegen_state.kernel.get_var_name_generator(), + term_set_to_count=sc.term_set_to_count, + available_variables=frozenset(), + term_set_to_variable=term_set_to_variable, + term_subset_to_count=compute_term_subset_to_count( + sc.term_set_to_count, term_set_to_variable)) + + return sr(node, subex_rep_state) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 536a186e7..16f6384fa 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -485,6 +485,20 @@ class ISPCASTBuilder(CASTBuilder): PREC_NONE, "i"), "++%s" % iname, inner) + + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + from cgen.ispc import ISPCUniform + from loopy.target.c import POD + + decl = ISPCUniform(POD(self, dtype, name)) + + from cgen import Initializer, Const + + if is_const: + decl = Const(decl) + + return Initializer(decl, val_str) + # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index bdd5773b3..df8c3ea35 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -135,8 +135,8 @@ def check_sizes(kernel, device): from loopy.diagnostic import LoopyAdvisory, LoopyError if device is None: - from loopy.diagnostic import warn - warn(kernel, "no_device_in_pre_codegen_checks", + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "no_device_in_pre_codegen_checks", "No device parameter was passed to the PyOpenCLTarget. " "Perhaps you want to pass a device to benefit from " "additional checking.", LoopyAdvisory) diff --git a/loopy/version.py b/loopy/version.py index aa3e7abee..91348ac56 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v44-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v45-islpy%s" % _islpy_version diff --git a/test/test_loopy.py b/test/test_loopy.py index 4af0cdb9e..48dfe0b39 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1399,6 +1399,8 @@ def test_special_instructions(ctx_factory): def test_index_cse(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m Date: Tue, 25 Oct 2016 00:01:19 -0500 Subject: [PATCH 02/17] Track change to process_ast in target --- loopy/target/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index aff1c7310..7acaab062 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -227,7 +227,7 @@ class ASTBuilderBase(object): # }}} - def process_ast(self, kernel, node): + def process_ast(self, codegen_state, node): return node -- GitLab From c8c5e4e0f346e297fabc423f41e9c29d1e4f65ce Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 25 Oct 2016 22:15:13 -0500 Subject: [PATCH 03/17] Fix scope generation for parallel loops --- loopy/codegen/loop.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 012f1a8f9..10dd26106 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -241,18 +241,20 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, from loopy.schedule import get_insn_ids_for_block_at insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) - if hw_inames_left is None: - all_inames_by_insns = set() + def get_all_inames_by_insns(): + result = set() for insn_id in insn_ids_for_block: - all_inames_by_insns |= kernel.insn_inames(insn_id) + result |= kernel.insn_inames(insn_id) + return result + if hw_inames_left is None: hw_inames_left = [iname - for iname in all_inames_by_insns + for iname in get_all_inames_by_insns() if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] if not hw_inames_left: hw_inames = tuple(iname - for iname in all_inames_by_insns + for iname in get_all_inames_by_insns() if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)) inner = next_func(codegen_state) -- GitLab From da71abb9ec11375943cec2f66288cf515e116a95 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 25 Oct 2016 22:16:04 -0500 Subject: [PATCH 04/17] Allow using kernel parameters in CSE --- loopy/target/c/compyte | 2 +- loopy/target/c/subscript_cse.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index ac1c71d46..11dc00352 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc +Subproject commit 11dc00352423cddd71f09e809d0a22ab1c3ea7a5 diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py index eb0a3400a..04941dc78 100644 --- a/loopy/target/c/subscript_cse.py +++ b/loopy/target/c/subscript_cse.py @@ -84,12 +84,12 @@ class SubscriptSubsetCounter(ExprIdentityMapper): def __init__(self, kernel, term_set_to_count): self.kernel = kernel self.term_set_to_count = term_set_to_count + self.allowable_vars = self.kernel.all_inames() | self.kernel.outer_params() def map_subscript(self, expr): - iname_terms, _ = get_terms(self.kernel.all_inames(), expr.index) - iname_terms = frozenset(iname_terms) - self.term_set_to_count[iname_terms] = \ - self.term_set_to_count.get(iname_terms, 0) + 1 + terms, _ = get_terms(self.allowable_vars, expr.index) + terms = frozenset(terms) + self.term_set_to_count[terms] = self.term_set_to_count.get(terms, 0) + 1 class ASTSubexpressionCollector(CASTIdentityMapper): @@ -325,7 +325,7 @@ def eliminate_common_subscripts(codegen_state, node): codegen_state=codegen_state, name_generator=codegen_state.kernel.get_var_name_generator(), term_set_to_count=sc.term_set_to_count, - available_variables=frozenset(), + available_variables=codegen_state.kernel.outer_params(), term_set_to_variable=term_set_to_variable, term_subset_to_count=compute_term_subset_to_count( sc.term_set_to_count, term_set_to_variable)) -- GitLab From daffcc0192ec735d1d150e9ba06087bc3f2e5531 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 25 Oct 2016 22:37:21 -0500 Subject: [PATCH 05/17] ASTSubexpressionReplacer: Replace CExpression instances with CExpression instances --- loopy/target/c/subscript_cse.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py index 04941dc78..f92703fe2 100644 --- a/loopy/target/c/subscript_cse.py +++ b/loopy/target/c/subscript_cse.py @@ -301,7 +301,9 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): from pymbolic.primitives import is_constant if isinstance(expr, CExpression): ssr = SubscriptSubsetReplacer(self, subex_rep_state) - return ssr(expr.expr) + return CExpression( + expr.to_code_mapper, + ssr(expr.expr)) elif isinstance(expr, str) or is_constant(expr): return expr else: -- GitLab From 064f9ae097805a3f019a1d51f0030e4866a1d934 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 30 Oct 2016 18:15:14 -0500 Subject: [PATCH 06/17] Make index CSE work on ISPC gnuma example --- loopy/codegen/loop.py | 6 +- loopy/symbolic.py | 40 ++++++++++++- loopy/target/c/__init__.py | 84 +++++++++++++++++++-------- loopy/target/c/codegen/expression.py | 40 ++++++++----- loopy/target/c/compyte | 2 +- loopy/target/c/subscript_cse.py | 68 +++++++++++++++------- loopy/target/cuda.py | 9 +-- loopy/target/ispc.py | 85 ++++++++++++++++++---------- loopy/target/opencl.py | 6 +- 9 files changed, 240 insertions(+), 100 deletions(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 10dd26106..099c6bf9e 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -152,6 +152,7 @@ def generate_unroll_loop(codegen_state, sched_index): inner = inner.with_new_ast( codegen_state, codegen_state.ast_builder.emit_scope( + codegen_state, (iname,), inner.current_ast(codegen_state))) result.append(inner) @@ -260,7 +261,9 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, inner = next_func(codegen_state) return inner.with_new_ast( codegen_state, - codegen_state.ast_builder.emit_scope(hw_inames, + codegen_state.ast_builder.emit_scope( + codegen_state, + hw_inames, inner.current_ast(codegen_state))) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( @@ -450,6 +453,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): inner = inner.with_new_ast( codegen_state, codegen_state.ast_builder.emit_scope( + codegen_state, (loop_iname,), inner.current_ast(codegen_state))) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index aa56c14fb..898a10ee6 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -79,6 +79,12 @@ class IdentityMapperMixin(object): def map_loopy_function_identifier(self, expr, *args): return expr + def map_vector_private_subscript(self, expr, *args): + return type(expr)( + self.rec(expr.aggregate, *args), + self.rec(expr.index, *args), + self.rec(expr.vector_width, *args)) + def map_reduction(self, expr, *args): mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames] @@ -167,10 +173,13 @@ class StringifyMapper(StringifyMapperBase): return expr.s def map_group_hw_index(self, expr, enclosing_prec): - return "grp.%d" % expr.index + return "grp.%d" % expr.axis def map_local_hw_index(self, expr, enclosing_prec): - return "loc.%d" % expr.index + return "loc.%d" % expr.axis + + def map_vector_private_subscript(self, expr, *args): + return self.rec(expr.aggregate[expr.index][LocalHardwareAxisIndex(0)]) def map_reduction(self, expr, prec): return "%sreduce(%s, [%s], %s)" % ( @@ -238,6 +247,12 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args) for child in expr.parameters) + def map_vector_private_subscript(self, expr, *args): + return self.combine(( + self.rec(expr.aggregate), + self.rec(expr.index), + self.rec(expr.vector_width))) + def map_reduction(self, expr): return (self.rec(expr.expr) - set(Variable(iname) for iname in expr.inames)) @@ -339,6 +354,27 @@ class FunctionIdentifier(Leaf): mapper_method = intern("map_loopy_function_identifier") +class VectorPrivateSubscript(Leaf): + """Realize a subscript into a private temporary that needs to be implicitly + indexed by ``local_id(0)``. + """ + + def __init__(self, aggregate, index, vector_width): + self.aggregate = aggregate + self.index = index + self.vector_width = vector_width + + def stringifier(self): + return StringifyMapper + + def __getinitargs__(self): + return (self.aggregate, self.index, self.vector_width) + + init_arg_names = ("aggregate", "index", "vector_width") + + mapper_method = "map_vector_private_subscript" + + class TypedCSE(CommonSubexpression): """A :class:`pymbolic.primitives.CommonSubexpression` annotated with a :class:`numpy.dtype`. diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index d7c7c92da..eb2456338 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -37,18 +37,6 @@ import cgen from pytools import memoize_method -class ScopeASTNode(cgen.Generable): - def __init__(self, available_variables, child): - self.available_variables = available_variables - self.child = child - - def generate(self): - for i in self.child.generate(): - yield i - - mapper_method = "map_loopy_scope" - - # {{{ dtype registry wrapper class DTypeRegistryWrapper(object): @@ -107,7 +95,7 @@ def _preamble_generator(preamble_info): # }}} -# {{{ cgen overrides +# {{{ cgen syntax tree from cgen import Declarator @@ -143,6 +131,44 @@ class POD(Declarator): mapper_method = "map_loopy_pod" + +class ScopeASTNode(cgen.Generable): + def __init__(self, codegen_state, available_variables, child): + self.codegen_state = codegen_state + self.available_variables = available_variables + self.child = child + + def generate(self): + for i in self.child.generate(): + yield i + + mapper_method = "map_loopy_scope" + + +class CgenLoopyLoopMixin(object): + def generate(self): + if self.intro_line() is not None: + yield self.intro_line() + + body = self.body + if isinstance(body, ScopeASTNode): + body = body.child + + from cgen import Block + if isinstance(body, Block): + for line in body.generate(): + yield line + else: + for line in body.generate(): + yield " "+line + + if self.outro_line() is not None: + yield self.outro_line() + + +class For(CgenLoopyLoopMixin, cgen.For): + pass + # }}} @@ -207,12 +233,17 @@ def generate_array_literal(codegen_state, array, value): # {{{ lazy expression generation class CExpression(object): - def __init__(self, to_code_mapper, expr): - self.to_code_mapper = to_code_mapper + def __init__(self, codegen_state, expr): + self.codegen_state = codegen_state self.expr = expr def __str__(self): - return self.to_code_mapper(self.expr, PREC_NONE) + to_code_mapper = \ + self.codegen_state.ast_builder.get_c_expression_to_code_mapper( + self.codegen_state) + return to_code_mapper( + self.expr, + PREC_NONE) # }}} @@ -484,9 +515,9 @@ class CASTBuilder(ASTBuilderBase): return ExpressionToCExpressionMapper( codegen_state, fortran_abi=self.target.fortran_abi) - def get_c_expression_to_code_mapper(self): + def get_c_expression_to_code_mapper(self, codegen_state): from loopy.target.c.codegen.expression import CExpressionToCodeMapper - return CExpressionToCodeMapper() + return CExpressionToCodeMapper(codegen_state) def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info): temp_var_decl = POD(self, decl_info.dtype, decl_info.name) @@ -643,7 +674,9 @@ class CASTBuilder(ASTBuilderBase): if len(mangle_result.result_dtypes) == 0: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression( + codegen_state.var_subst_map, + result)) result = ecm.wrap_in_typecast( mangle_result.result_dtypes[0], @@ -655,7 +688,9 @@ class CASTBuilder(ASTBuilderBase): from cgen import Assign return Assign( lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression( + codegen_state.var_subst_map, + result)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, static_lbound, static_ubound, inner): @@ -666,7 +701,7 @@ class CASTBuilder(ASTBuilderBase): from pymbolic import var from pymbolic.primitives import Comparison from pymbolic.mapper.stringifier import PREC_NONE - from cgen import For, InlineInitializer + from cgen import InlineInitializer return For( InlineInitializer( @@ -703,14 +738,15 @@ class CASTBuilder(ASTBuilderBase): from cgen import If return If(condition_str, ast) - def emit_scope(self, available_variables, ast): - return ScopeASTNode(available_variables, ast) + def emit_scope(self, codegen_state, available_variables, ast): + return ScopeASTNode(codegen_state, available_variables, ast) # }}} def process_ast(self, codegen_state, node): from loopy.target.c.subscript_cse import eliminate_common_subscripts - return eliminate_common_subscripts(codegen_state, node) + return eliminate_common_subscripts(codegen_state, + node=node) # vim: foldmethod=marker diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 6d06d8a79..ddbe3c91b 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -108,7 +108,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): assert prec == PREC_NONE from loopy.target.c import CExpression return CExpression( - self.codegen_state.ast_builder.get_c_expression_to_code_mapper(), + self.codegen_state, self.rec(expr, type_context, needed_dtype)) # }}} @@ -118,15 +118,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): return x if expr.name in self.codegen_state.var_subst_map: - if self.kernel.options.annotate_inames: - return var( - "/* %s */ %s" % ( - expr.name, - self.rec(self.codegen_state.var_subst_map[expr.name], - type_context))) - else: - return self.rec(self.codegen_state.var_subst_map[expr.name], - type_context) + return expr + elif expr.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.name] from loopy.kernel.array import ArrayBase @@ -650,10 +643,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} def map_group_hw_index(self, expr, type_context): - raise LoopyError("plain C does not have group hw axes") + return expr def map_local_hw_index(self, expr, type_context): - raise LoopyError("plain C does not have local hw axes") + return expr # }}} @@ -661,6 +654,9 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ C expression to code mapper class CExpressionToCodeMapper(RecursiveMapper): + def __init__(self, codegen_state): + self.codegen_state = codegen_state + # {{{ helpers def parenthesize_if_needed(self, s, enclosing_prec, my_prec): @@ -698,6 +694,19 @@ class CExpressionToCodeMapper(RecursiveMapper): "entry to loopy") def map_variable(self, expr, enclosing_prec): + if expr.name in self.codegen_state.var_subst_map: + if self.codegen_state.kernel.options.annotate_inames: + return var( + "/* %s */ %s" % ( + expr.name, + self.rec( + self.codegen_state.var_subst_map[expr.name], + enclosing_prec))) + else: + return self.rec( + self.codegen_state.var_subst_map[expr.name], + enclosing_prec) + return expr.name def map_lookup(self, expr, enclosing_prec): @@ -812,8 +821,11 @@ class CExpressionToCodeMapper(RecursiveMapper): self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) - # map_group_hw_index: eliminated at the loopy -> C level - # map_local_hw_index: eliminated at the loopy -> C level + def map_group_hw_index(self, expr, enclosing_prec): + raise LoopyError("plain C does not have group hw axes") + + def map_local_hw_index(self, expr, enclosing_prec): + raise LoopyError("plain C does not have local hw axes") # }}} diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 11dc00352..ac1c71d46 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 11dc00352423cddd71f09e809d0a22ab1c3ea7a5 +Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py index f92703fe2..46ebd466d 100644 --- a/loopy/target/c/subscript_cse.py +++ b/loopy/target/c/subscript_cse.py @@ -43,6 +43,7 @@ logger = logging.getLogger(__name__) class CASTIdentityMapper(CASTIdentityMapperBase): def map_loopy_scope(self, node, *args, **kwargs): return type(node)( + node.codegen_state, node.available_variables, self.rec(node.child, *args, **kwargs)) @@ -56,7 +57,7 @@ def generate_all_subsets(l, min_length): yield frozenset(entry for i, entry in enumerate(l) if (1 << i) & bits) -def get_terms(allowable_vars, expr): +def get_terms(allowable_vars, is_term_allowed, expr): if isinstance(expr, p.Sum): terms = expr.children else: @@ -68,7 +69,10 @@ def get_terms(allowable_vars, expr): result = [] remainder = [] for term in terms: - if get_dependencies(term) <= allowable_vars and not is_constant(term): + deps = get_dependencies(term) + if (deps <= allowable_vars + and not is_constant(term) + and is_term_allowed(term, deps)): result.append(term) elif remainder is not None: remainder.append(term) @@ -81,22 +85,26 @@ def get_terms(allowable_vars, expr): # {{{ counting class SubscriptSubsetCounter(ExprIdentityMapper): - def __init__(self, kernel, term_set_to_count): - self.kernel = kernel + def __init__(self, codegen_state, term_set_to_count, is_term_allowed): + self.codegen_state = codegen_state self.term_set_to_count = term_set_to_count - self.allowable_vars = self.kernel.all_inames() | self.kernel.outer_params() + kernel = codegen_state.kernel + self.allowable_vars = kernel.all_inames() | kernel.outer_params() + self.is_term_allowed = is_term_allowed def map_subscript(self, expr): - terms, _ = get_terms(self.allowable_vars, expr.index) + terms, _ = get_terms(self.allowable_vars, self.is_term_allowed, expr.index) terms = frozenset(terms) self.term_set_to_count[terms] = self.term_set_to_count.get(terms, 0) + 1 + map_vector_private_subscript = map_subscript + class ASTSubexpressionCollector(CASTIdentityMapper): - def __init__(self, kernel): + def __init__(self, codegen_state, is_term_allowed): self.term_set_to_count = {} self.subset_count_mapper = SubscriptSubsetCounter( - kernel, self.term_set_to_count) + codegen_state, self.term_set_to_count, is_term_allowed) def map_expression(self, expr): from pymbolic.primitives import is_constant @@ -154,7 +162,10 @@ def compute_term_subset_to_count(term_set_to_count, term_set_to_variable): if var_term_set <= code_term_set] logger.debug("INTERACTS:", interacts_with_var_term_sets) - for subset in generate_all_subsets(code_term_set, 2): + for subset in generate_all_subsets(code_term_set, 1): + if len(subset) == 1 and isinstance(six.next(iter(subset)), p.Variable): + continue + will_contribute = True for var_term_set in interacts_with_var_term_sets: @@ -204,15 +215,19 @@ class SubscriptSubsetReplacer(ExprIdentityMapper): self.node_replacer = node_replacer self.subex_rep_state = subex_rep_state - def map_subscript(self, expr): + def _process_subscript(self, expr): subex_rep_state = self.subex_rep_state iname_terms, remainder = get_terms( subex_rep_state.codegen_state.kernel.all_inames(), - expr.index) - iname_terms = simplify_terms( + is_term_allowed=lambda term, deps: True, + expr=expr.index) + return simplify_terms( frozenset(iname_terms), - subex_rep_state.term_set_to_variable) + subex_rep_state.term_set_to_variable), remainder + + def map_subscript(self, expr): + iname_terms, remainder = self._process_subscript(expr) expr = type(expr)( expr.aggregate, @@ -220,10 +235,21 @@ class SubscriptSubsetReplacer(ExprIdentityMapper): return super(SubscriptSubsetReplacer, self).map_subscript(expr) + def map_vector_private_subscript(self, expr): + iname_terms, remainder = self._process_subscript(expr) + + expr = type(expr)( + expr.aggregate, + p.Sum(tuple(iname_terms) + tuple(remainder)), + expr.vector_width) + + return super(SubscriptSubsetReplacer, self)\ + .map_vector_private_subscript(expr) + class ASTSubexpressionReplacer(CASTIdentityMapper): def map_loopy_scope(self, node, subex_rep_state): - codegen_state = subex_rep_state.codegen_state + codegen_state = node.codegen_state available_variables = ( subex_rep_state.available_variables @@ -268,12 +294,11 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): initializers.append( codegen_state.ast_builder.emit_initializer( - codegen_state, + node.codegen_state, codegen_state.kernel.index_dtype, var_name, CExpression( - subex_rep_state.codegen_state.ast_builder - .get_c_expression_to_code_mapper(), + node.codegen_state, new_var_expr), is_const=True)) @@ -288,7 +313,8 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): subnode = Block(initializers + subnode.contents) else: subnode = Block(initializers+[subnode]) - node = ScopeASTNode(node.available_variables, subnode) + node = ScopeASTNode( + node.codegen_state, node.available_variables, subnode) subex_rep_state = subex_rep_state.copy( term_set_to_variable=term_set_to_variable, @@ -302,7 +328,7 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): if isinstance(expr, CExpression): ssr = SubscriptSubsetReplacer(self, subex_rep_state) return CExpression( - expr.to_code_mapper, + expr.codegen_state, ssr(expr.expr)) elif isinstance(expr, str) or is_constant(expr): return expr @@ -313,11 +339,11 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): # }}} -def eliminate_common_subscripts(codegen_state, node): +def eliminate_common_subscripts(codegen_state, is_term_allowed, node): if not codegen_state.kernel.options.eliminate_common_subscripts: return node - sc = ASTSubexpressionCollector(codegen_state.kernel) + sc = ASTSubexpressionCollector(codegen_state, is_term_allowed) sc(node) sr = ASTSubexpressionReplacer() diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 36650e1d2..6601cdbe3 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -29,7 +29,8 @@ import numpy as np from pytools import memoize_method from loopy.target.c import CTarget, CASTBuilder -from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper +from loopy.target.c.codegen.expression import ( + CExpressionToCodeMapper) from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import temp_var_scope @@ -140,7 +141,7 @@ def cuda_function_mangler(kernel, name, arg_dtypes): # {{{ expression mapper -class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper): +class CUDACExpressionToCodeMapper(CExpressionToCodeMapper): _GRID_AXES = "xyz" @staticmethod @@ -278,8 +279,8 @@ class CUDACASTBuilder(CASTBuilder): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, codegen_state): - return ExpressionToCudaCExpressionMapper(codegen_state) + def get_c_expression_to_code_mapper(self, codegen_state): + return CUDACExpressionToCodeMapper(codegen_state) _VEC_AXES = "xyzw" diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 16f6384fa..377af27c2 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -27,37 +27,19 @@ THE SOFTWARE. import numpy as np # noqa from loopy.target.c import CTarget, CASTBuilder -from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper +from loopy.target.c.codegen.expression import ( + ExpressionToCExpressionMapper, CExpressionToCodeMapper) from loopy.diagnostic import LoopyError -from loopy.symbolic import Literal +from loopy.symbolic import Literal, VectorPrivateSubscript, LocalHardwareAxisIndex from pymbolic import var import pymbolic.primitives as p from pytools import memoize_method -# {{{ expression mapper +# {{{ expression -> C Expression mapper class ExprToISPCExprMapper(ExpressionToCExpressionMapper): - def _get_index_ctype(self): - if self.kernel.index_dtype.numpy_dtype == np.int32: - return "int32" - elif self.kernel.index_dtype.numpy_dtype == np.int64: - return "int64" - else: - raise ValueError("unexpected index_type") - - def map_group_hw_index(self, expr, type_context): - return var( - "((uniform %s) taskIndex%d)" - % (self._get_index_ctype(), expr.axis)) - - def map_local_hw_index(self, expr, type_context): - if expr.axis == 0: - return var("(varying %s) programIndex" % self._get_index_ctype()) - else: - raise LoopyError("ISPC only supports one local axis") - def map_constant(self, expr, type_context): if isinstance(expr, (complex, np.complexfloating)): raise NotImplementedError("complex numbers in ispc") @@ -87,7 +69,8 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): # below in decl generation) gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: - return expr[var("programIndex")] + lsize, = lsize + return VectorPrivateSubscript(expr, 0, lsize) else: return expr @@ -112,8 +95,10 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): self.codegen_state.vectorization_info) subscript, = access_info.subscripts - result = var(access_info.array_name)[ - var("programIndex") + self.rec(lsize*subscript, 'i')] + result = VectorPrivateSubscript( + var(access_info.array_name), + self.rec(subscript, 'i'), + lsize) if access_info.vector_index is not None: return self.kernel.target.add_vector_access( @@ -127,6 +112,32 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): # }}} +class ISPCExprToCodeMapper(CExpressionToCodeMapper): + def _get_index_ctype(self): + if self.codegen_state.kernel.index_dtype.numpy_dtype == np.int32: + return "int32" + elif self.codegen_state.kernel.index_dtype.numpy_dtype == np.int64: + return "int64" + else: + raise ValueError("unexpected index_type") + + def map_group_hw_index(self, expr, enclosing_prec): + return var( + "((uniform %s) taskIndex%d)" + % (self._get_index_ctype(), expr.axis)) + + def map_local_hw_index(self, expr, enclosing_prec): + if expr.axis == 0: + return var("(varying %s) programIndex" % self._get_index_ctype()) + else: + raise LoopyError("ISPC only supports one local axis") + + def map_vector_private_subscript(self, expr, enclosing_prec): + return self.rec(expr.aggregate[ + expr.vector_width * expr.index + LocalHardwareAxisIndex(0)], + enclosing_prec) + + # {{{ type registry def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): @@ -278,6 +289,9 @@ class ISPCASTBuilder(CASTBuilder): def get_expression_to_c_expression_mapper(self, codegen_state): return ExprToISPCExprMapper(codegen_state) + def get_c_expression_to_code_mapper(self, codegen_state): + return ISPCExprToCodeMapper(codegen_state) + def add_vector_access(self, access_expr, index): return access_expr[index] @@ -469,15 +483,14 @@ class ISPCASTBuilder(CASTBuilder): ecm = codegen_state.expression_to_code_mapper from loopy.symbolic import aff_to_expr - from loopy.target.c import POD + from loopy.target.c import POD, For from pymbolic.mapper.stringifier import PREC_NONE - from cgen import For, Initializer - + from cgen import InlineInitializer from cgen.ispc import ISPCUniform return For( - Initializer( + InlineInitializer( ISPCUniform(POD(self, iname_dtype, iname)), ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), ecm( @@ -501,6 +514,20 @@ class ISPCASTBuilder(CASTBuilder): # }}} + def process_ast(self, codegen_state, node): + knl = codegen_state.kernel + + from loopy.kernel.data import LocalIndexTagBase + + def is_term_allowed(term, dependencies): + return all( + not isinstance( + knl.iname_to_tag.get(dep), LocalIndexTagBase) + for dep in dependencies) + + from loopy.target.c.subscript_cse import eliminate_common_subscripts + return eliminate_common_subscripts(codegen_state, is_term_allowed, node) + # TODO: Generate launch code # TODO: Vector types (element access: done) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 9c2d428b0..270115060 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -27,7 +27,7 @@ THE SOFTWARE. import numpy as np from loopy.target.c import CTarget, CASTBuilder -from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper +from loopy.target.c.codegen.expression import CExpressionToCodeMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType @@ -296,7 +296,7 @@ def opencl_preamble_generator(preamble_info): # {{{ expression mapper -class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper): +class ExpressionToOpenCLCExpressionMapper(CExpressionToCodeMapper): def map_group_hw_index(self, expr, type_context): return var("gid")(expr.axis) @@ -363,8 +363,6 @@ class OpenCLTarget(CTarget): # }}} -# }}} - # {{{ ast builder -- GitLab From 2dea68cbc900b1f1b8d95d891b485a40af22bdf4 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 30 Oct 2016 18:38:50 -0500 Subject: [PATCH 07/17] Fix group/local id codegen for CL/CUDA --- loopy/target/cuda.py | 13 ++++++------- loopy/target/opencl.py | 9 ++++----- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 6601cdbe3..6dc65bfd5 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -34,7 +34,6 @@ from loopy.target.c.codegen.expression import ( from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import temp_var_scope -from pymbolic import var # {{{ vector types @@ -153,15 +152,15 @@ class CUDACExpressionToCodeMapper(CExpressionToCodeMapper): else: raise LoopyError("unexpected index type") - def map_group_hw_index(self, expr, type_context): - return var("((%s) blockIdx.%s)" % ( + def map_group_hw_index(self, expr, enclosing_prec): + return "((%s) blockIdx.%s)" % ( self._get_index_ctype(self.kernel), - self._GRID_AXES[expr.axis])) + self._GRID_AXES[expr.axis]) - def map_local_hw_index(self, expr, type_context): - return var("((%s) threadIdx.%s)" % ( + def map_local_hw_index(self, expr, enclosing_prec): + return "((%s) threadIdx.%s)" % ( self._get_index_ctype(self.kernel), - self._GRID_AXES[expr.axis])) + self._GRID_AXES[expr.axis]) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 270115060..67871f1e0 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -33,7 +33,6 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper from loopy.kernel.data import temp_var_scope, CallMangleInfo -from pymbolic import var # {{{ dtype registry wrappers @@ -297,11 +296,11 @@ def opencl_preamble_generator(preamble_info): # {{{ expression mapper class ExpressionToOpenCLCExpressionMapper(CExpressionToCodeMapper): - def map_group_hw_index(self, expr, type_context): - return var("gid")(expr.axis) + def map_group_hw_index(self, expr, enclosing_prec): + return "gid(%d)" % expr.axis - def map_local_hw_index(self, expr, type_context): - return var("lid")(expr.axis) + def map_local_hw_index(self, expr, enclosing_prec): + return "lid(%d)" % expr.axi # }}} -- GitLab From a0f0dfc85269851fc5e9017f8034e3f5238f195e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 30 Oct 2016 20:05:22 -0500 Subject: [PATCH 08/17] More index CSE fixes --- loopy/target/c/__init__.py | 7 ++++--- loopy/target/c/subscript_cse.py | 11 ++++++----- loopy/target/opencl.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index eb2456338..2ec789579 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -133,8 +133,8 @@ class POD(Declarator): class ScopeASTNode(cgen.Generable): - def __init__(self, codegen_state, available_variables, child): - self.codegen_state = codegen_state + def __init__(self, var_subst_map, available_variables, child): + self.var_subst_map = var_subst_map self.available_variables = available_variables self.child = child @@ -739,13 +739,14 @@ class CASTBuilder(ASTBuilderBase): return If(condition_str, ast) def emit_scope(self, codegen_state, available_variables, ast): - return ScopeASTNode(codegen_state, available_variables, ast) + return ScopeASTNode(codegen_state.var_subst_map, available_variables, ast) # }}} def process_ast(self, codegen_state, node): from loopy.target.c.subscript_cse import eliminate_common_subscripts return eliminate_common_subscripts(codegen_state, + is_term_allowed=lambda term, deps: True, node=node) diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py index 46ebd466d..d1f7d1e04 100644 --- a/loopy/target/c/subscript_cse.py +++ b/loopy/target/c/subscript_cse.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) class CASTIdentityMapper(CASTIdentityMapperBase): def map_loopy_scope(self, node, *args, **kwargs): return type(node)( - node.codegen_state, + node.var_subst_map, node.available_variables, self.rec(node.child, *args, **kwargs)) @@ -249,7 +249,8 @@ class SubscriptSubsetReplacer(ExprIdentityMapper): class ASTSubexpressionReplacer(CASTIdentityMapper): def map_loopy_scope(self, node, subex_rep_state): - codegen_state = node.codegen_state + codegen_state = subex_rep_state.codegen_state.copy( + var_subst_map=node.var_subst_map) available_variables = ( subex_rep_state.available_variables @@ -294,11 +295,11 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): initializers.append( codegen_state.ast_builder.emit_initializer( - node.codegen_state, + codegen_state, codegen_state.kernel.index_dtype, var_name, CExpression( - node.codegen_state, + codegen_state, new_var_expr), is_const=True)) @@ -314,7 +315,7 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): else: subnode = Block(initializers+[subnode]) node = ScopeASTNode( - node.codegen_state, node.available_variables, subnode) + codegen_state, node.available_variables, subnode) subex_rep_state = subex_rep_state.copy( term_set_to_variable=term_set_to_variable, diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 67871f1e0..2f533755e 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -430,7 +430,7 @@ class OpenCLCASTBuilder(CASTBuilder): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, codegen_state): + def get_c_expression_to_code_mapper(self, codegen_state): return ExpressionToOpenCLCExpressionMapper(codegen_state) def add_vector_access(self, access_expr, index): -- GitLab From 7fbdf75bed145edac9ea9219a0cfa8f4002d63a3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 30 Oct 2016 20:16:54 -0500 Subject: [PATCH 09/17] Fix index CSE typos in OpenCL backend --- loopy/target/opencl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 2f533755e..4236d011e 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -295,12 +295,12 @@ def opencl_preamble_generator(preamble_info): # {{{ expression mapper -class ExpressionToOpenCLCExpressionMapper(CExpressionToCodeMapper): +class CLExpressionToCodeMapper(CExpressionToCodeMapper): def map_group_hw_index(self, expr, enclosing_prec): return "gid(%d)" % expr.axis def map_local_hw_index(self, expr, enclosing_prec): - return "lid(%d)" % expr.axi + return "lid(%d)" % expr.axis # }}} @@ -431,7 +431,7 @@ class OpenCLCASTBuilder(CASTBuilder): # {{{ code generation guts def get_c_expression_to_code_mapper(self, codegen_state): - return ExpressionToOpenCLCExpressionMapper(codegen_state) + return CLExpressionToCodeMapper(codegen_state) def add_vector_access(self, access_expr, index): # The 'int' avoids an 'L' suffix for long ints. -- GitLab From 62fec5cf49b3f5c742f63923aaf36323ac90ea91 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 30 Oct 2016 21:38:35 -0500 Subject: [PATCH 10/17] Fix emit_scope calling seq in Target base class --- loopy/target/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 7acaab062..b2d619dfc 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -213,7 +213,7 @@ class ASTBuilderBase(object): def emit_if(self, condition_str, ast): raise NotImplementedError() - def emit_scope(self, available_variables, ast): + def emit_scope(self, codegen_state, available_variables, ast): return ast def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): -- GitLab From 2f60062800a943aaec85ea5528ffd0854239d525 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 30 Oct 2016 21:57:44 -0500 Subject: [PATCH 11/17] Construct scopes with right var_subst_map for unrolled and seq loops --- loopy/codegen/loop.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 099c6bf9e..cc0183528 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -150,11 +150,11 @@ def generate_unroll_loop(codegen_state, sched_index): new_codegen_state = codegen_state.fix(iname, idx_aff) inner = build_loop_nest(new_codegen_state, sched_index+1) inner = inner.with_new_ast( - codegen_state, + new_codegen_state, codegen_state.ast_builder.emit_scope( - codegen_state, + new_codegen_state, (iname,), - inner.current_ast(codegen_state))) + inner.current_ast(new_codegen_state))) result.append(inner) return merge_codegen_results(codegen_state, result) @@ -451,11 +451,11 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): inner = build_loop_nest(new_codegen_state, sched_index+1) inner = inner.with_new_ast( - codegen_state, + new_codegen_state, codegen_state.ast_builder.emit_scope( - codegen_state, + new_codegen_state, (loop_iname,), - inner.current_ast(codegen_state))) + inner.current_ast(new_codegen_state))) # }}} -- GitLab From 0efb0a299407fcec675060c69dd09ac9de757438 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 30 Oct 2016 22:17:42 -0500 Subject: [PATCH 12/17] Fix CUDA target after index CSE --- loopy/target/cuda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 6dc65bfd5..fa536ad46 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -154,12 +154,12 @@ class CUDACExpressionToCodeMapper(CExpressionToCodeMapper): def map_group_hw_index(self, expr, enclosing_prec): return "((%s) blockIdx.%s)" % ( - self._get_index_ctype(self.kernel), + self._get_index_ctype(self.codegen_state.kernel), self._GRID_AXES[expr.axis]) def map_local_hw_index(self, expr, enclosing_prec): return "((%s) threadIdx.%s)" % ( - self._get_index_ctype(self.kernel), + self._get_index_ctype(self.codegen_state.kernel), self._GRID_AXES[expr.axis]) # }}} -- GitLab From 3c11dd82f8c26c2064d090031cfab118f0d5cba7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 30 Oct 2016 22:18:23 -0500 Subject: [PATCH 13/17] Fix CExpression creation in multiple assignment --- loopy/target/c/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 2ec789579..cb6fb6c06 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -675,7 +675,7 @@ class CASTBuilder(ASTBuilderBase): from cgen import ExpressionStatement return ExpressionStatement( CExpression( - codegen_state.var_subst_map, + codegen_state, result)) result = ecm.wrap_in_typecast( @@ -689,7 +689,7 @@ class CASTBuilder(ASTBuilderBase): return Assign( lhs_code, CExpression( - codegen_state.var_subst_map, + codegen_state, result)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, -- GitLab From e85e6706bc7806280422e1831061d4ef2e9490fe Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 16 Nov 2016 12:27:41 -0600 Subject: [PATCH 14/17] PEP8 spacing fix --- loopy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index fad6e6eab..a8b8f6624 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -448,8 +448,8 @@ def _set_up_default_target(): set_default_target(target) -_set_up_default_target() +_set_up_default_target() # }}} -- GitLab From b3098c00fffce6928d321daa88f3d36d7e87e441 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 16 Nov 2016 12:28:41 -0600 Subject: [PATCH 15/17] Fix logging in subscript CSE --- loopy/target/c/subscript_cse.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py index d1f7d1e04..37b7ea575 100644 --- a/loopy/target/c/subscript_cse.py +++ b/loopy/target/c/subscript_cse.py @@ -151,17 +151,17 @@ class SubexpressionReplacementState(Record): def compute_term_subset_to_count(term_set_to_count, term_set_to_variable): logger.debug("TERM SET TO SUBSET COUNT:") for term_set, count in six.iteritems(term_set_to_count): - logger.debug(" + ".join(str(i) for i in term_set), count) + logger.debug("%s: %d" % (" + ".join(str(i) for i in term_set), count)) result = {} for code_term_set, cnt in six.iteritems(term_set_to_count): - logger.debug("CTS:", " + ".join(str(i) for i in code_term_set)) + logger.debug("CTS: " + " + ".join(str(i) for i in code_term_set)) interacts_with_var_term_sets = [ var_term_set for var_term_set in six.iterkeys(term_set_to_variable) if var_term_set <= code_term_set] - logger.debug("INTERACTS:", interacts_with_var_term_sets) + logger.debug("INTERACTS: " + str(interacts_with_var_term_sets)) for subset in generate_all_subsets(code_term_set, 1): if len(subset) == 1 and isinstance(six.next(iter(subset)), p.Variable): continue @@ -182,13 +182,13 @@ def compute_term_subset_to_count(term_set_to_count, term_set_to_variable): logger.debug("TERM SUBSET TO COUNT:") for term_set, count in six.iteritems(result): - logger.debug(" + ".join(str(i) for i in term_set), count) + logger.debug("%s: %d" % (" + ".join(str(i) for i in term_set), count)) return result def simplify_terms(terms, term_set_to_variable): - logger.debug("BUILDING EXPR FOR:", "+".join(str(s) for s in terms)) + logger.debug("BUILDING EXPR FOR: " + "+".join(str(s) for s in terms)) did_something = True while did_something: did_something = False @@ -198,15 +198,15 @@ def simplify_terms(terms, term_set_to_variable): # longest first key=lambda entry: len(entry[0]), reverse=True): if subset <= terms: - logger.debug("SIMPLIFYING", "+".join(str(s) for s in subset), - "->", var_name) + logger.debug("SIMPLIFYING " + "+".join(str(s) for s in subset) + + "->" + var_name) terms = ( (terms - subset) | frozenset([p.Variable(var_name)])) did_something = True break - logger.debug("GOT", "+".join(str(s) for s in terms)) + logger.debug("GOT " + "+".join(str(s) for s in terms)) return terms -- GitLab From 3f160ba79b4ea906b4108e65add60a14a87a866d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 16 Nov 2016 12:30:48 -0600 Subject: [PATCH 16/17] Do index common subexp elimination in ISPC on vector-private indices on [8*x + 8*y + 8*z + programIndex] --- loopy/symbolic.py | 36 -------------- loopy/target/__init__.py | 6 ++- loopy/target/c/__init__.py | 8 ++- loopy/target/c/subscript_cse.py | 36 +++++--------- loopy/target/ispc.py | 88 +++++++++++++++++++++------------ loopy/target/python.py | 3 +- test/test_loopy.py | 9 ++-- 7 files changed, 81 insertions(+), 105 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 898a10ee6..fceba7d20 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -79,12 +79,6 @@ class IdentityMapperMixin(object): def map_loopy_function_identifier(self, expr, *args): return expr - def map_vector_private_subscript(self, expr, *args): - return type(expr)( - self.rec(expr.aggregate, *args), - self.rec(expr.index, *args), - self.rec(expr.vector_width, *args)) - def map_reduction(self, expr, *args): mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames] @@ -178,9 +172,6 @@ class StringifyMapper(StringifyMapperBase): def map_local_hw_index(self, expr, enclosing_prec): return "loc.%d" % expr.axis - def map_vector_private_subscript(self, expr, *args): - return self.rec(expr.aggregate[expr.index][LocalHardwareAxisIndex(0)]) - def map_reduction(self, expr, prec): return "%sreduce(%s, [%s], %s)" % ( "simul_" if expr.allow_simultaneous else "", @@ -247,12 +238,6 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args) for child in expr.parameters) - def map_vector_private_subscript(self, expr, *args): - return self.combine(( - self.rec(expr.aggregate), - self.rec(expr.index), - self.rec(expr.vector_width))) - def map_reduction(self, expr): return (self.rec(expr.expr) - set(Variable(iname) for iname in expr.inames)) @@ -354,27 +339,6 @@ class FunctionIdentifier(Leaf): mapper_method = intern("map_loopy_function_identifier") -class VectorPrivateSubscript(Leaf): - """Realize a subscript into a private temporary that needs to be implicitly - indexed by ``local_id(0)``. - """ - - def __init__(self, aggregate, index, vector_width): - self.aggregate = aggregate - self.index = index - self.vector_width = vector_width - - def stringifier(self): - return StringifyMapper - - def __getinitargs__(self): - return (self.aggregate, self.index, self.vector_width) - - init_arg_names = ("aggregate", "index", "vector_width") - - mapper_method = "map_vector_private_subscript" - - class TypedCSE(CommonSubexpression): """A :class:`pymbolic.primitives.CommonSubexpression` annotated with a :class:`numpy.dtype`. diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index b2d619dfc..bd69a9d52 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -216,7 +216,8 @@ class ASTBuilderBase(object): def emit_scope(self, codegen_state, available_variables, ast): return ast - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const, + short_for_expr=None): raise NotImplementedError() def emit_blank_line(self): @@ -283,7 +284,8 @@ class DummyHostASTBuilder(ASTBuilderBase): def emit_if(self, condition_str, ast): return None - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const, + short_for_expr=None): return None def emit_blank_line(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index cb6fb6c06..487b14c6b 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -716,7 +716,8 @@ class CASTBuilder(ASTBuilderBase): "++%s" % iname, inner) - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const, + short_for_expr=None): decl = POD(self, dtype, name) from cgen import Initializer, Const @@ -745,9 +746,6 @@ class CASTBuilder(ASTBuilderBase): def process_ast(self, codegen_state, node): from loopy.target.c.subscript_cse import eliminate_common_subscripts - return eliminate_common_subscripts(codegen_state, - is_term_allowed=lambda term, deps: True, - node=node) - + return eliminate_common_subscripts(codegen_state, node=node) # vim: foldmethod=marker diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py index 37b7ea575..e434599db 100644 --- a/loopy/target/c/subscript_cse.py +++ b/loopy/target/c/subscript_cse.py @@ -57,7 +57,7 @@ def generate_all_subsets(l, min_length): yield frozenset(entry for i, entry in enumerate(l) if (1 << i) & bits) -def get_terms(allowable_vars, is_term_allowed, expr): +def get_terms(allowable_vars, expr): if isinstance(expr, p.Sum): terms = expr.children else: @@ -71,8 +71,7 @@ def get_terms(allowable_vars, is_term_allowed, expr): for term in terms: deps = get_dependencies(term) if (deps <= allowable_vars - and not is_constant(term) - and is_term_allowed(term, deps)): + and not is_constant(term)): result.append(term) elif remainder is not None: remainder.append(term) @@ -85,26 +84,23 @@ def get_terms(allowable_vars, is_term_allowed, expr): # {{{ counting class SubscriptSubsetCounter(ExprIdentityMapper): - def __init__(self, codegen_state, term_set_to_count, is_term_allowed): + def __init__(self, codegen_state, term_set_to_count): self.codegen_state = codegen_state self.term_set_to_count = term_set_to_count kernel = codegen_state.kernel self.allowable_vars = kernel.all_inames() | kernel.outer_params() - self.is_term_allowed = is_term_allowed def map_subscript(self, expr): - terms, _ = get_terms(self.allowable_vars, self.is_term_allowed, expr.index) + terms, _ = get_terms(self.allowable_vars, expr.index) terms = frozenset(terms) self.term_set_to_count[terms] = self.term_set_to_count.get(terms, 0) + 1 - map_vector_private_subscript = map_subscript - class ASTSubexpressionCollector(CASTIdentityMapper): - def __init__(self, codegen_state, is_term_allowed): + def __init__(self, codegen_state): self.term_set_to_count = {} self.subset_count_mapper = SubscriptSubsetCounter( - codegen_state, self.term_set_to_count, is_term_allowed) + codegen_state, self.term_set_to_count) def map_expression(self, expr): from pymbolic.primitives import is_constant @@ -220,7 +216,6 @@ class SubscriptSubsetReplacer(ExprIdentityMapper): iname_terms, remainder = get_terms( subex_rep_state.codegen_state.kernel.all_inames(), - is_term_allowed=lambda term, deps: True, expr=expr.index) return simplify_terms( frozenset(iname_terms), @@ -235,17 +230,6 @@ class SubscriptSubsetReplacer(ExprIdentityMapper): return super(SubscriptSubsetReplacer, self).map_subscript(expr) - def map_vector_private_subscript(self, expr): - iname_terms, remainder = self._process_subscript(expr) - - expr = type(expr)( - expr.aggregate, - p.Sum(tuple(iname_terms) + tuple(remainder)), - expr.vector_width) - - return super(SubscriptSubsetReplacer, self)\ - .map_vector_private_subscript(expr) - class ASTSubexpressionReplacer(CASTIdentityMapper): def map_loopy_scope(self, node, subex_rep_state): @@ -288,6 +272,7 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): var_name = subex_rep_state.name_generator("index_subexp") + old_var_expr = p.Sum(tuple(new_var_subset)) new_var_expr = p.Sum(tuple( simplify_terms(new_var_subset, term_set_to_variable))) @@ -301,7 +286,8 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): CExpression( codegen_state, new_var_expr), - is_const=True)) + is_const=True, + short_for_expr=old_var_expr)) term_subset_to_count = compute_term_subset_to_count( subex_rep_state.term_set_to_count, @@ -340,11 +326,11 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): # }}} -def eliminate_common_subscripts(codegen_state, is_term_allowed, node): +def eliminate_common_subscripts(codegen_state, node): if not codegen_state.kernel.options.eliminate_common_subscripts: return node - sc = ASTSubexpressionCollector(codegen_state, is_term_allowed) + sc = ASTSubexpressionCollector(codegen_state) sc(node) sr = ASTSubexpressionReplacer() diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 377af27c2..f9aebd40a 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -30,7 +30,7 @@ from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ( ExpressionToCExpressionMapper, CExpressionToCodeMapper) from loopy.diagnostic import LoopyError -from loopy.symbolic import Literal, VectorPrivateSubscript, LocalHardwareAxisIndex +from loopy.symbolic import Literal, LocalHardwareAxisIndex, CombineMapper from pymbolic import var import pymbolic.primitives as p @@ -39,6 +39,15 @@ from pytools import memoize_method # {{{ expression -> C Expression mapper +def _multiply_terms(expr, factor): + if isinstance(expr, p.Sum): + terms = expr.children + else: + terms = (expr,) + + return p.flattened_sum(factor*term for term in terms) + + class ExprToISPCExprMapper(ExpressionToCExpressionMapper): def map_constant(self, expr, type_context): if isinstance(expr, (complex, np.complexfloating)): @@ -69,8 +78,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): # below in decl generation) gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: - lsize, = lsize - return VectorPrivateSubscript(expr, 0, lsize) + return expr[LocalHardwareAxisIndex(0)] else: return expr @@ -95,10 +103,11 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): self.codegen_state.vectorization_info) subscript, = access_info.subscripts - result = VectorPrivateSubscript( - var(access_info.array_name), - self.rec(subscript, 'i'), - lsize) + result = var(access_info.array_name)[ + LocalHardwareAxisIndex(0) + + _multiply_terms( + self.rec(subscript, 'i'), + lsize)] if access_info.vector_index is not None: return self.kernel.target.add_vector_access( @@ -122,21 +131,14 @@ class ISPCExprToCodeMapper(CExpressionToCodeMapper): raise ValueError("unexpected index_type") def map_group_hw_index(self, expr, enclosing_prec): - return var( - "((uniform %s) taskIndex%d)" - % (self._get_index_ctype(), expr.axis)) + return "((uniform %s) taskIndex%d)" % (self._get_index_ctype(), expr.axis) def map_local_hw_index(self, expr, enclosing_prec): if expr.axis == 0: - return var("(varying %s) programIndex" % self._get_index_ctype()) + return "(varying %s) programIndex" % self._get_index_ctype() else: raise LoopyError("ISPC only supports one local axis") - def map_vector_private_subscript(self, expr, enclosing_prec): - return self.rec(expr.aggregate[ - expr.vector_width * expr.index + LocalHardwareAxisIndex(0)], - enclosing_prec) - # {{{ type registry @@ -161,6 +163,30 @@ def fill_registry_with_ispc_types(reg, respect_windows, include_bool=True): # }}} +class HasProgramIndexMapper(CombineMapper): + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + return any(values) + + def map_constant(self, expr): + return False + + def map_variable(self, expr): + from loopy.kernel.data import LocalIndexTagBase + return ( + isinstance( + self.kernel.iname_to_tag.get(expr.name, None), + LocalIndexTagBase)) + + def map_group_hw_index(self, expr): + return False + + def map_local_hw_index(self, expr): + return True + + class ISPCTarget(CTarget): """A code generation target for Intel's `ISPC `_ SPMD programming language, to target Intel's Knight's hardware and modern @@ -499,34 +525,34 @@ class ISPCASTBuilder(CASTBuilder): "++%s" % iname, inner) - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): - from cgen.ispc import ISPCUniform + def emit_initializer(self, codegen_state, dtype, name, val, is_const, + short_for_expr=None): + from cgen.ispc import ISPCUniform, ISPCVarying from loopy.target.c import POD - decl = ISPCUniform(POD(self, dtype, name)) + rhs_has_programindex = False + if short_for_expr is not None: + rhs_has_programindex = HasProgramIndexMapper( + codegen_state.kernel)(short_for_expr) + + decl = POD(self, dtype, name) + if rhs_has_programindex: + decl = ISPCVarying(decl) + else: + decl = ISPCUniform(decl) from cgen import Initializer, Const if is_const: decl = Const(decl) - return Initializer(decl, val_str) + return Initializer(decl, val) # }}} def process_ast(self, codegen_state, node): - knl = codegen_state.kernel - - from loopy.kernel.data import LocalIndexTagBase - - def is_term_allowed(term, dependencies): - return all( - not isinstance( - knl.iname_to_tag.get(dep), LocalIndexTagBase) - for dep in dependencies) - from loopy.target.c.subscript_cse import eliminate_common_subscripts - return eliminate_common_subscripts(codegen_state, is_term_allowed, node) + return eliminate_common_subscripts(codegen_state, node) # TODO: Generate launch code diff --git a/loopy/target/python.py b/loopy/target/python.py index 591161d81..76bbb28b7 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -240,7 +240,8 @@ class PythonASTBuilderBase(ASTBuilderBase): ), inner) - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const, + short_for_expr=None): from genpy import Assign return Assign(name, val_str) diff --git a/test/test_loopy.py b/test/test_loopy.py index 48dfe0b39..32ffa2625 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1403,22 +1403,21 @@ def test_index_cse(ctx_factory): knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m Date: Sat, 19 Nov 2016 17:22:09 -0600 Subject: [PATCH 17/17] Tweak subscript CSE: make temporary for single-use-in-inner-loop, tweak 'simplicity' heuristic, generate deterministic code --- loopy/target/c/subscript_cse.py | 165 +++++++++++++++++++++++++------- loopy/target/ispc.py | 2 +- 2 files changed, 131 insertions(+), 36 deletions(-) diff --git a/loopy/target/c/subscript_cse.py b/loopy/target/c/subscript_cse.py index e434599db..0ffdd864a 100644 --- a/loopy/target/c/subscript_cse.py +++ b/loopy/target/c/subscript_cse.py @@ -57,6 +57,14 @@ def generate_all_subsets(l, min_length): yield frozenset(entry for i, entry in enumerate(l) if (1 << i) & bits) +def is_const_product(term): + return ( + p.is_constant(term) + or ( + isinstance(term, p.Product) + and all(is_const_product(ch) for ch in term.children))) + + def get_terms(allowable_vars, expr): if isinstance(expr, p.Sum): terms = expr.children @@ -64,16 +72,15 @@ def get_terms(allowable_vars, expr): terms = (expr,) from loopy.symbolic import get_dependencies - from pymbolic.primitives import is_constant result = [] remainder = [] for term in terms: deps = get_dependencies(term) if (deps <= allowable_vars - and not is_constant(term)): + and not is_const_product(term)): result.append(term) - elif remainder is not None: + else: remainder.append(term) return result, remainder @@ -84,28 +91,54 @@ def get_terms(allowable_vars, expr): # {{{ counting class SubscriptSubsetCounter(ExprIdentityMapper): - def __init__(self, codegen_state, term_set_to_count): + def __init__(self, codegen_state, term_set_to_inside_inames_list, + inside_inames): self.codegen_state = codegen_state - self.term_set_to_count = term_set_to_count + self.term_set_to_inside_inames_list = term_set_to_inside_inames_list kernel = codegen_state.kernel self.allowable_vars = kernel.all_inames() | kernel.outer_params() + self.inside_inames = inside_inames def map_subscript(self, expr): terms, _ = get_terms(self.allowable_vars, expr.index) terms = frozenset(terms) - self.term_set_to_count[terms] = self.term_set_to_count.get(terms, 0) + 1 + self.term_set_to_inside_inames_list[terms] = ( + self.term_set_to_inside_inames_list.get(terms, []) + + [self.inside_inames]) class ASTSubexpressionCollector(CASTIdentityMapper): def __init__(self, codegen_state): - self.term_set_to_count = {} - self.subset_count_mapper = SubscriptSubsetCounter( - codegen_state, self.term_set_to_count) + self.term_set_to_inside_inames_list = {} + self.codegen_state = codegen_state + self.inside_inames_stack = [] + + def map_loopy_scope(self, node): + if self.inside_inames_stack: + new_inside_inames = self.inside_inames_stack[-1] + else: + new_inside_inames = () + + new_inside_inames = ( + new_inside_inames + node.available_variables) + + self.inside_inames_stack.append(new_inside_inames) + result = super(ASTSubexpressionCollector, self).map_loopy_scope(node) + self.inside_inames_stack.pop() + return result def map_expression(self, expr): from pymbolic.primitives import is_constant if isinstance(expr, CExpression): - self.subset_count_mapper(expr.expr) + if self.inside_inames_stack: + inside_inames = self.inside_inames_stack[-1] + else: + inside_inames = () + count_mapper = SubscriptSubsetCounter( + self.codegen_state, + self.term_set_to_inside_inames_list, + inside_inames) + count_mapper(expr.expr) return expr elif isinstance(expr, str) or is_constant(expr): return expr @@ -126,11 +159,12 @@ class SubexpressionReplacementState(Record): A callable that can generate new identifiers. - .. attribute:: term_set_to_count + .. attribute:: term_set_to_inside_inames_list - A mapping from (summed) sets of subexpressions to their use counts. + A mapping from (summed) sets of subexpressions to a list of tuples of inames + within which the use is nested, one per use. - .. attribute:: term_subset_to_count + .. attribute:: term_subset_to_inside_inames_list A mapping from (summed) subsets of subexpressions to their use counts. @@ -144,13 +178,45 @@ class SubexpressionReplacementState(Record): """ -def compute_term_subset_to_count(term_set_to_count, term_set_to_variable): +def is_simple(term): + from loopy.symbolic import HardwareAxisIndex + + if p.is_constant(term): + return True + + if (isinstance(term, p.Variable) + or isinstance(term, HardwareAxisIndex)): + return True + + if isinstance(term, p.Product): + n_constants = 0 + n_simple = 0 + n_other = 0 + + for ch in term.children: + if p.is_constant(ch): + n_constants += 1 + elif is_simple(ch): + n_simple += 1 + else: + n_other += 1 + + return n_other == 0 and n_simple <= 1 + + return False + + +def compute_term_subset_to_inside_inames_list( + term_set_to_inside_inames_list, term_set_to_variable): logger.debug("TERM SET TO SUBSET COUNT:") - for term_set, count in six.iteritems(term_set_to_count): - logger.debug("%s: %d" % (" + ".join(str(i) for i in term_set), count)) + for term_set, in_iname_uses in six.iteritems(term_set_to_inside_inames_list): + logger.debug( + "%s: %d" % (" + ".join(str(i) for i in term_set), + len(in_iname_uses))) result = {} - for code_term_set, cnt in six.iteritems(term_set_to_count): + for code_term_set, in_iname_uses in six.iteritems( + term_set_to_inside_inames_list): logger.debug("CTS: " + " + ".join(str(i) for i in code_term_set)) interacts_with_var_term_sets = [ var_term_set @@ -159,8 +225,10 @@ def compute_term_subset_to_count(term_set_to_count, term_set_to_variable): logger.debug("INTERACTS: " + str(interacts_with_var_term_sets)) for subset in generate_all_subsets(code_term_set, 1): - if len(subset) == 1 and isinstance(six.next(iter(subset)), p.Variable): - continue + if len(subset) == 1: + term, = subset + if is_simple(term): + continue will_contribute = True @@ -172,13 +240,15 @@ def compute_term_subset_to_count(term_set_to_count, term_set_to_variable): break if will_contribute: - result[subset] = result.get(subset, 0) + cnt + result[subset] = result.get(subset, []) + in_iname_uses logger.debug("CTS DONE") logger.debug("TERM SUBSET TO COUNT:") - for term_set, count in six.iteritems(result): - logger.debug("%s: %d" % (" + ".join(str(i) for i in term_set), count)) + for term_set, in_iname_uses in six.iteritems(result): + logger.debug( + "%s: %d" % (" + ".join(str(i) for i in term_set), + len(in_iname_uses))) return result @@ -203,7 +273,11 @@ def simplify_terms(terms, term_set_to_variable): break logger.debug("GOT " + "+".join(str(s) for s in terms)) - return terms + + def term_sort_key(term): + return str(term) + + return sorted(terms, key=term_sort_key) class SubscriptSubsetReplacer(ExprIdentityMapper): @@ -239,38 +313,57 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): available_variables = ( subex_rep_state.available_variables | frozenset(node.available_variables)) + subex_rep_state = subex_rep_state.copy( available_variables=available_variables) term_set_to_variable = subex_rep_state.term_set_to_variable.copy() - term_subset_to_count = subex_rep_state.term_subset_to_count + term_subset_to_inside_inames_list = \ + subex_rep_state.term_subset_to_inside_inames_list from loopy.symbolic import get_dependencies + from pytools import argmin2 from cgen import Block from loopy.target.c import ScopeASTNode initializers = [] + def is_in_deeper_loop(in_iname_uses): + for iiu in in_iname_uses: + iiu = frozenset(iiu) + + if available_variables & iiu < iiu: # note: not equal! + return True + + return False + while True: eligible_subsets = frozenset( term_set - for term_set, count in six.iteritems(term_subset_to_count) + for term_set, in_iname_uses in six.iteritems( + term_subset_to_inside_inames_list) if all(get_dependencies(term) <= available_variables for term in term_set) - if count >= 2) + if len(in_iname_uses) >= 2 # used more than once + or is_in_deeper_loop(in_iname_uses)) if not eligible_subsets: break + def get_name_sort_key(subset): + return (sorted(str(term) for term in subset)) + # find the shortest, most-used subexpression new_var_subset, _ = argmin2( ((subset, - (len(subset), -term_subset_to_count[subset])) + (len(subset), + -len(term_subset_to_inside_inames_list[subset]), + get_name_sort_key(subset))) for subset in eligible_subsets), return_value=True) - var_name = subex_rep_state.name_generator("index_subexp") + var_name = subex_rep_state.name_generator("ind") old_var_expr = p.Sum(tuple(new_var_subset)) new_var_expr = p.Sum(tuple( @@ -289,9 +382,10 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): is_const=True, short_for_expr=old_var_expr)) - term_subset_to_count = compute_term_subset_to_count( - subex_rep_state.term_set_to_count, - term_set_to_variable) + term_subset_to_inside_inames_list = \ + compute_term_subset_to_inside_inames_list( + subex_rep_state.term_set_to_inside_inames_list, + term_set_to_variable) # insert initializer code if initializers: @@ -305,7 +399,7 @@ class ASTSubexpressionReplacer(CASTIdentityMapper): subex_rep_state = subex_rep_state.copy( term_set_to_variable=term_set_to_variable, - term_subset_to_count=term_subset_to_count) + term_subset_to_inside_inames_list=term_subset_to_inside_inames_list) return super(ASTSubexpressionReplacer, self).map_loopy_scope( node, subex_rep_state) @@ -339,10 +433,11 @@ def eliminate_common_subscripts(codegen_state, node): subex_rep_state = SubexpressionReplacementState( codegen_state=codegen_state, name_generator=codegen_state.kernel.get_var_name_generator(), - term_set_to_count=sc.term_set_to_count, + term_set_to_inside_inames_list=sc.term_set_to_inside_inames_list, available_variables=codegen_state.kernel.outer_params(), term_set_to_variable=term_set_to_variable, - term_subset_to_count=compute_term_subset_to_count( - sc.term_set_to_count, term_set_to_variable)) + term_subset_to_inside_inames_list=( + compute_term_subset_to_inside_inames_list( + sc.term_set_to_inside_inames_list, term_set_to_variable))) return sr(node, subex_rep_state) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index be3a766e1..2a931f527 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -46,7 +46,7 @@ def _multiply_terms(expr, factor): else: terms = (expr,) - return p.flattened_sum(factor*term for term in terms) + return p.flattened_sum(p.flattened_product((factor, term)) for term in terms) class ExprToISPCExprMapper(ExpressionToCExpressionMapper): -- GitLab