diff --git a/MEMO b/MEMO index c08463c3f0e116f39360b8c3e947b5a513e95943..22fa41ca87c7ff4d86d143aec45d6299cc1d087d 100644 --- a/MEMO +++ b/MEMO @@ -53,10 +53,11 @@ Things to consider - Parallel dimension splitting/merging via tags -- Generalize reduction to be over multiplie variables - - Implement get_problems() +- CSE iname duplication might be unnecessary? + (don't think so: It might be desired to do a full fetch before a mxm k loop + even if that requires going iterative.) Dealt with ^^^^^^^^^^ @@ -65,6 +66,8 @@ Dealt with - Types of reduction variables? +- Generalize reduction to be over multiple variables + How to represent the schedule ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/loopy/__init__.py b/loopy/__init__.py index 7e0474c4d5d5c541aa60b4f9e22354f121598494..d9cedd753b8f4a741a58a43be92c0b4590efed89 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -78,7 +78,7 @@ def split_dimension(kernel, name, inner_length, padded_length=None, .intersect(inner_constraint_set) .project_out(name_dim_type, name_idx, 1)) - new_domain = process_set(kernel.domain) + new_domain = process_set(kernel.domain) new_assumptions = process_set(kernel.assumptions) from pymbolic import var @@ -272,21 +272,21 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non # {{{ build the new instruction's iname_to tag from loopy.symbolic import IndexVariableFinder - insn_iname_to_tag = {} + new_iname_to_tag = {} for old_iname, new_iname in zip(duplicate_inames, new_inames): - insn_iname_to_tag[new_iname] = dup_iname_to_tag[old_iname] + new_iname_to_tag[new_iname] = dup_iname_to_tag[old_iname] index_deps = ( - IndexVariableFinder()(new_outer_expr) + IndexVariableFinder()(new_inner_expr) | set(new_inames)) for iname in index_deps: - if iname not in insn_iname_to_tag: + if iname not in new_iname_to_tag: # assume generating instruction's view on # inames on which we don't have an opinion. if iname in insn.iname_to_tag: - insn_iname_to_tag[iname] = insn.iname_to_tag[iname] + new_iname_to_tag[iname] = insn.iname_to_tag[iname] # }}} @@ -295,7 +295,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non id=kernel.make_unique_instruction_id(based_on=cse_tag), assignee=assignee, expression=new_inner_expr, - iname_to_tag=insn_iname_to_tag, + iname_to_tag=new_iname_to_tag, ) cse_result_insns.append(new_insn) @@ -401,9 +401,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non -def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None): - dtype = np.dtype(dtype) - +def realize_reduction(kernel, loop_iname, reduction_tag=None): new_insns = [] new_temporary_variables = kernel.temporary_variables[:] @@ -415,9 +413,6 @@ def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None): from pymbolic import var - from loopy.kernel import REGISTERED_REDUCTION_OPS - red_op = REGISTERED_REDUCTION_OPS[expr.operation] - target_var_name = kernel.make_unique_var_name("red_"+loop_iname, extra_used_vars=set(tv.name for tv in new_temporary_variables)) target_var = var(target_var_name) @@ -428,35 +423,38 @@ def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None): new_temporary_variables.append( TemporaryVariable( name=target_var_name, - dtype=dtype, + dtype=expr.operation.dtype, shape=(), is_local=False)) init_iname_to_tag = insn.iname_to_tag.copy() + for iname in expr.inames: + del init_iname_to_tag[iname] init_insn = Instruction( id=kernel.make_unique_instruction_id( extra_used_ids=set(ni.id for ni in new_insns)), assignee=target_var, - forced_iname_deps=list(insn.all_inames() - set(expr.iname)), - expression=red_op.neutral_element) + forced_iname_deps=list(insn.all_inames() - set(expr.inames)), + expression=expr.operation.neutral_element, + iname_to_tag=init_iname_to_tag) new_insns.append(init_insn) - from loopy.kernel import SequentialTag reduction_insn = Instruction( id=kernel.make_unique_instruction_id( extra_used_ids=set(ni.id for ni in new_insns)), assignee=target_var, - expression=red_op(target_var, sub_expr), + expression=expr.operation(target_var, sub_expr), forced_insn_deps=[init_insn.id], use_auto_dependencies=False, forced_iname_deps=list(insn.all_inames()), - iname_to_tag={expr.iname: SequentialTag()}) + iname_to_tag=insn.iname_to_tag) new_insns.append(reduction_insn) new_insn_forced_insn_deps.append(reduction_insn.id) + new_insn_removed_inames.extend(expr.inames) return target_var @@ -465,12 +463,20 @@ def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None): for insn in kernel.instructions: new_insn_forced_insn_deps = [] + new_insn_removed_inames = [] + + new_expression = cb_mapper(insn.expression) + + new_iname_to_tag = insn.iname_to_tag.copy() + for iname in new_insn_removed_inames: + del new_iname_to_tag[iname] new_insns.append( insn.copy( - expression=cb_mapper(insn.expression), + expression=new_expression, forced_insn_deps=insn.forced_insn_deps + new_insn_forced_insn_deps, + iname_to_tag=new_iname_to_tag, )) return kernel.copy( diff --git a/loopy/kernel.py b/loopy/kernel.py index da04a654505dd13f47bff745d1b13c62b5ee2675..5df71831ab9c9e3cabaa8f8b838a1c96f32fcbef 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -24,7 +24,8 @@ class IndexTag(Record): class SequentialTag(IndexTag): - pass + def __str__(self): + return "seq" class ParallelTag(IndexTag): pass @@ -232,7 +233,7 @@ class Instruction(Record): def map_reduction(expr, rec): rec(expr.expr) - reduction_inames.add(expr.iname) + reduction_inames.update(expr.inames) ReductionCallbackMapper(map_reduction)(expression) @@ -266,18 +267,10 @@ class Instruction(Record): @memoize_method def all_inames(self): - from loopy.symbolic import VariableIndexExpressionCollector - idx_exprs = ( - VariableIndexExpressionCollector()(self.expression) - | VariableIndexExpressionCollector()(self.assignee)) - from pymbolic.mapper.dependency import DependencyMapper - index_vars = set() - - from pymbolic.primitives import Variable - for idx_expr in idx_exprs: - for i in DependencyMapper()(idx_expr): - if isinstance(i, Variable): - index_vars.add(i.name) + from loopy.symbolic import IndexVariableFinder + index_vars = ( + IndexVariableFinder()(self.expression) + | IndexVariableFinder()(self.assignee)) return index_vars | set(self.forced_iname_deps) @@ -308,7 +301,7 @@ class Instruction(Record): def __str__(self): loop_descrs = [] - for iname in self.all_inames(): + for iname in sorted(self.all_inames()): tag = self.iname_to_tag.get(iname) if tag is None: @@ -325,33 +318,42 @@ class Instruction(Record): # {{{ reduction operations -class ReductionOperation: +class ReductionOperation(object): """ :ivar neutral_element: + :ivar dtype: """ def __call__(self, operand1, operand2): raise NotImplementedError -class SumReductionOperation: +class TypedReductionOperation(ReductionOperation): + def __init__(self, dtype): + self.dtype = dtype + + def __str__(self): + return (type(self).__name__.replace("ReductionOperation", "").lower() + + "_" + str(self.dtype)) + +class SumReductionOperation(TypedReductionOperation): neutral_element = 0 def __call__(self, operand1, operand2): return operand1 + operand2 -class ProductReductionOperation: +class ProductReductionOperation(TypedReductionOperation): neutral_element = 1 def __call__(self, operand1, operand2): return operand1 * operand2 -class FloatingPointMaxOperation: +class FloatingPointMaxOperation(TypedReductionOperation): neutral_element = -var("INFINITY") def __call__(self, operand1, operand2): return var("max")(operand1, operand2) -class FloatingPointMaxOperation: +class FloatingPointMaxOperation(TypedReductionOperation): # OpenCL 1.1, section 6.11.2 neutral_element = -var("INFINITY") @@ -359,7 +361,7 @@ class FloatingPointMaxOperation: from pymbolic.primitives import FunctionSymbol return FunctionSymbol("max")(operand1, operand2) -class FloatingPointMinOperation: +class FloatingPointMinOperation(TypedReductionOperation): # OpenCL 1.1, section 6.11.2 neutral_element = var("INFINITY") @@ -370,29 +372,45 @@ class FloatingPointMinOperation: -REGISTERED_REDUCTION_OPS = { - "sum": SumReductionOperation(), - "product": ProductReductionOperation(), - "fp_max": FloatingPointMaxOperation(), - "fp_min": FloatingPointMinOperation(), +_REDUCTION_OPS = { + "sum": SumReductionOperation, + "product": ProductReductionOperation, + "fpmax": FloatingPointMaxOperation, + "fpmin": FloatingPointMinOperation, } +_REDUCTION_OP_PARSERS = [ + ] -def register_reduction(prefix, op): +def register_reduction_parser(parser): """Register a new :class:`ReductionOperation`. - :arg prefix: the desired name of the operation - :return: the name actually assigned to the operation, - will start with *prefix*. + :arg parser: A function that receives a string and returns + a subclass of ReductionOperation. """ + _REDUCTION_OP_PARSERS.append(parser) + +def parse_reduction_op(name): + import re + red_op_match = re.match(r"^([a-z]+)_([a-z0-9]+)$", name) + if red_op_match: + op_name = red_op_match.group(1) + op_type = red_op_match.group(2) + try: + op_dtype = np.dtype(op_type) + except TypeError: + op_dtype = None + + if op_name in _REDUCTION_OPS and op_dtype is not None: + return _REDUCTION_OPS[op_name](op_dtype) - from loopy.tools import generate_unique_possibilities + for parser in _REDUCTION_OP_PARSERS: + result = parser(name) + if result is not None: + return result - for name in generate_unique_possibilities(prefix): - if name not in REGISTERED_REDUCTION_OPS: - REGISTERED_REDUCTION_OPS[name] = op - return name + raise RuntimeError("could not parse reudction operation '%s'" % name) # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 69637268c68a4de070d924da134f9e0a8bf3f386..816943f3a3d73dd8c94951b9d33bcccd1a6c1055 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -22,23 +22,29 @@ from islpy import dim_type # {{{ loopy-specific primitives class Reduction(AlgebraicLeaf): - def __init__(self, operation, iname, expr, tag=None): + def __init__(self, operation, inames, expr, tag=None): + assert isinstance(inames, tuple) + + if isinstance(operation, str): + from loopy.kernel import parse_reduction_op + operation = parse_reduction_op(operation) + self.operation = operation - self.iname = iname + self.inames = inames self.expr = expr self.tag = tag def __getinitargs__(self): - return (self.operation, self.iname, self.expr, self.tag) + return (self.operation, self.inames, self.expr, self.tag) def get_hash(self): - return hash((self.__class__, self.operation, self.iname, + return hash((self.__class__, self.operation, self.inames, self.expr, self.tag)) def is_equal(self, other): return (other.__class__ == self.__class__ and other.operation == self.operation - and other.iname == self.iname + and other.inames == self.inames and other.expr == self.expr and other.tag == self.tag) @@ -53,7 +59,7 @@ class Reduction(AlgebraicLeaf): class IdentityMapperMixin(object): def map_reduction(self, expr): - return Reduction(expr.operation, expr.iname, + return Reduction(expr.operation, expr.inames, self.rec(expr.expr), expr.tag) class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): @@ -68,8 +74,8 @@ class SubstitutionMapper(SubstitutionMapperBase, IdentityMapperMixin): class StringifyMapper(StringifyMapperBase): def map_reduction(self, expr, prec): - return "reduce(%s, %s, %s, tag=%s)" % ( - expr.operation, expr.iname, expr.expr, expr.tag) + return "reduce(%s, [%s], %s, tag=%s)" % ( + expr.operation, ", ".join(expr.inames), expr.expr, expr.tag) # }}} @@ -94,10 +100,10 @@ class FunctionToPrimitiveMapper(IdentityMapper): elif isinstance(expr.function, Variable) and expr.function.name == "reduce": if len(expr.parameters) == 3: - operation, iname, red_expr = expr.parameters + operation, inames, red_expr = expr.parameters tag = None elif len(expr.parameters) == 4: - operation, iname, red_expr, tag = expr.parameters + operation, inames, red_expr, tag = expr.parameters else: raise TypeError("reduce takes three or four arguments") @@ -106,15 +112,27 @@ class FunctionToPrimitiveMapper(IdentityMapper): if not isinstance(operation, Variable): raise TypeError("operation argument to reduce() must be a symbol") operation = operation.name - if not isinstance(iname, Variable): - raise TypeError("iname argument to reduce() must be a symbol") - iname = iname.name + if isinstance(inames, Variable): + inames = (inames,) + + if not isinstance(inames, (tuple)): + raise TypeError("iname argument to reduce() must be a symbol " + "or a tuple of symbols") + + processed_inames = [] + for iname in inames: + if not isinstance(iname, Variable): + raise TypeError("iname argument to reduce() must be a symbol " + "or a tuple or a tuple of symbols") + + processed_inames.append(iname.name) + if tag is not None: if not isinstance(tag, Variable): raise TypeError("tag argument to reduce() must be a symbol") tag = tag.name - return Reduction(operation, iname, red_expr, tag) + return Reduction(operation, tuple(processed_inames), red_expr, tag) else: return IdentityMapper.map_call(self, expr) @@ -123,34 +141,18 @@ class FunctionToPrimitiveMapper(IdentityMapper): # {{{ reduction loop splitter class ReductionLoopSplitter(IdentityMapper): - def __init__(self, old_iname, outer_iname, inner_iname, do_warn=True): + def __init__(self, old_iname, outer_iname, inner_iname): self.old_iname = old_iname self.outer_iname = outer_iname self.inner_iname = inner_iname - self.do_warn = do_warn def map_reduction(self, expr): - if expr.iname == self.old_iname: - if self.do_warn: - from warnings import warn - from loopy import LoopyAdvisory - warn("The reduction loop for iname '%s' (with tag '%s') " - "was split into two nested reductions with separate " - "state variables. It might be advantageous to " - "'realize' the reduction first before splitting " - "this loop." % (expr.iname, expr.tag), LoopyAdvisory) - - if expr.tag is not None: - outer_tag = expr.tag + "_outer" - inner_tag = expr.tag + "_inner" - else: - outer_tag = None - inner_tag = None - - return Reduction(expr.operation, self.outer_iname, - Reduction(expr.operation, self.inner_iname, - expr.expr, inner_tag), - outer_tag) + if self.old_iname in expr.inames: + new_inames = expr.inames[:] + new_inames.remove(self.old_iname) + new_inames.extend([self.outer_iname, self.inner_iname]) + return Reduction(expr.operation, new_inames, + expr.expr, expr.tag) else: return IdentityMapper.map_reduction(self, expr) @@ -469,9 +471,24 @@ class IndexVariableFinder(CombineMapper): import operator return reduce(operator.or_, values, set()) + def map_constant(self, expr): + return set() + + def map_algebraic_leaf(self, expr): + return set() + def map_subscript(self, expr): from pymbolic.mapper.dependency import DependencyMapper - return DependencyMapper()(expr) + idx_vars = DependencyMapper()(expr.index) + + from pymbolic.primitives import Variable + result = set() + for idx_var in idx_vars: + if isinstance(idx_var, Variable): + result.add(idx_var.name) + else: + raise RuntimeError("index variable not understood: %s" % idx_var) + return result # }}} diff --git a/test/test_matmul.py b/test/test_matmul.py index b58edc582575aa2f09f41dcbd5eb6a7dde36f7e3..5e5564a143f15e710e220cbd84a42bfd89505720 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -203,7 +203,7 @@ def test_plain_matrix_mul_new_ui(ctx_factory): knl = lp.LoopKernel(ctx.devices[0], "[n] -> {[i,j,k]: 0<=i,j,k<n}", [ - "c[i, j] = reduce(sum, k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))" + "c[i, j] = reduce(sum_float32, k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), @@ -220,7 +220,7 @@ def test_plain_matrix_mul_new_ui(ctx_factory): for insn in knl.instructions: print insn print - knl = lp.realize_reduction(knl, "k", dtype) + knl = lp.realize_reduction(knl, "k") for insn in knl.instructions: print insn print