diff --git a/loopy/__init__.py b/loopy/__init__.py index 189aabd4cd738542da2ff4d6e521c3ed708a783c..9c16200f37f18875f2ccfe6546d6e7cfc2ea0f2a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -22,8 +22,10 @@ class LoopyAdvisory(UserWarning): from loopy.kernel import ScalarArg, GlobalArg, ArrayArg, ConstantArg, ImageArg from loopy.kernel import (AutoFitLocalIndexTag, get_dot_dependency_graph, - LoopKernel, Instruction) + LoopKernel, Instruction, default_function_mangler, single_arg_function_mangler, + default_preamble_generator) from loopy.creation import make_kernel +from loopy.reduction import register_reduction_parser from loopy.subst import extract_subst, expand_subst from loopy.cse import precompute from loopy.preprocess import preprocess_kernel, realize_reduction @@ -34,7 +36,11 @@ from loopy.check import check_kernels __all__ = ["ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "ImageArg", "LoopKernel", - "Instruction", "make_kernel", + "Instruction", + "default_function_mangler", "single_arg_function_mangler", + "default_preamble_generator", + "make_kernel", + "register_reduction_parser", "get_dot_dependency_graph", "preprocess_kernel", "realize_reduction", "generate_loop_schedules", @@ -64,7 +70,7 @@ def split_dimension(kernel, split_iname, inner_length, if split_iname not in kernel.all_inames(): raise ValueError("cannot split loop for unknown variable '%s'" % split_iname) - applied_substitutions = kernel.applied_substitutions[:] + applied_iname_rewrites = kernel.applied_iname_rewrites[:] if outer_iname is None: outer_iname = split_iname+"_outer" @@ -109,7 +115,7 @@ def split_dimension(kernel, split_iname, inner_length, new_insns = [] for insn in kernel.instructions: subst_map = {var(split_iname): new_loop_index} - applied_substitutions.append(subst_map) + applied_iname_rewrites.append(subst_map) from loopy.symbolic import SubstitutionMapper subst_mapper = SubstitutionMapper(subst_map.get) @@ -139,7 +145,7 @@ def split_dimension(kernel, split_iname, inner_length, .copy(domain=new_domain, iname_slab_increments=iname_slab_increments, instructions=new_insns, - applied_substitutions=applied_substitutions, + applied_iname_rewrites=applied_iname_rewrites, )) return tag_dimensions(result, {outer_iname: outer_tag, inner_iname: inner_tag}) @@ -232,7 +238,7 @@ def join_dimensions(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()): .map_expressions(subst_map, exclude_instructions=True) .copy( instructions=new_insns, domain=new_domain, - applied_substitutions=kernel.applied_substitutions + [subst_map] + applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_map] )) return tag_dimensions(result, {new_iname: tag}) @@ -376,7 +382,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, if len(si) != arg.dimensions: raise ValueError("sweep index '%s' has the wrong number of dimensions") - for subst_map in kernel.applied_substitutions: + for subst_map in kernel.applied_iname_rewrites: from loopy.symbolic import SubstitutionMapper from pymbolic.mapper.substitutor import make_subst_func si = SubstitutionMapper(make_subst_func(subst_map))(si) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e4d1f83a18b0d648eae5f121ca476e6a21860cc4..b969d989682b1cdac32e77625f9f73f7facd4e10 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -201,12 +201,16 @@ def generate_code(kernel, with_annotation=False, if var.dtype.kind == "c": allow_complex = True + seen_dtypes = set() + seen_functions = set() + from loopy.codegen.expression import LoopyCCodeMapper - ccm = (LoopyCCodeMapper(kernel, with_annotation=with_annotation, + ccm = (LoopyCCodeMapper(kernel, seen_dtypes, seen_functions, + with_annotation=with_annotation, allow_complex=allow_complex) .copy_and_assign_many(make_initial_assignments(kernel))) - mod = Module() + mod = [] body = Block() @@ -220,7 +224,6 @@ def generate_code(kernel, with_annotation=False, else: return RestrictPointer(arg) - has_double = False has_image = False from loopy.kernel import GlobalArg, ConstantArg, ImageArg, ScalarArg @@ -250,16 +253,8 @@ def generate_code(kernel, with_annotation=False, else: raise ValueError("argument type not understood: '%s'" % type(arg)) - if arg.dtype in [np.float64, np.complex128]: - has_double = True - args.append(arg_decl) - if has_double: - mod.extend([ - Line("#pragma OPENCL EXTENSION cl_khr_fp64: enable"), - Line()]) - if has_image: body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")), "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " @@ -267,24 +262,6 @@ def generate_code(kernel, with_annotation=False, # }}} - # {{{ handle preambles - - seen_preamble_tags = set() - dedup_preambles = [] - - for tag, preamble in kernel.preambles: - if tag in seen_preamble_tags: - continue - - seen_preamble_tags.add(tag) - dedup_preambles.append(preamble) - - mod.extend( - [LiteralLines(lines) for lines in dedup_preambles] - +[Line()]) - - # }}} - mod.extend([ LiteralLines(r""" #define lid(N) ((int) get_local_id(N)) @@ -357,7 +334,28 @@ def generate_code(kernel, with_annotation=False, from loopy.check import check_implemented_domains assert check_implemented_domains(kernel, gen_code.implemented_domains) - return str(mod) + # {{{ handle preambles + + preambles = kernel.preambles[:] + for prea_gen in kernel.preamble_generators: + preambles.extend(prea_gen(seen_dtypes, seen_functions)) + + seen_preamble_tags = set() + dedup_preambles = [] + + for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]): + if tag in seen_preamble_tags: + continue + + seen_preamble_tags.add(tag) + dedup_preambles.append(preamble) + + mod = ([LiteralLines(lines) for lines in dedup_preambles] + +[Line()] + mod) + + # }}} + + return str(Module(mod)) # }}} diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py index d77cdbd2a752d88631bfdb8bb62cc7f94fc1f59e..760962a3d3f1e482105ab97f8bd26207f6ab7ee5 100644 --- a/loopy/codegen/expression.py +++ b/loopy/codegen/expression.py @@ -52,19 +52,21 @@ class TypeInferenceMapper(CombineMapper): def map_call(self, expr): from pymbolic.primitives import Variable - if isinstance(expr.function, Variable): - name = expr.function.name - arg_dtypes = tuple(self.rec(par) for par in expr.parameters) - for rdg in self.kernel.function_result_dtype_getters: - result = rdg(name, arg_dtypes) - if result is not None: - return result - raise RuntimeError("no type inference information on " - "function '%s'" % name) + identifier = expr.function + if isinstance(identifier, Variable): + identifier = identifier.name - else: - return CombineMapper.map_call(self, expr) + arg_dtypes = tuple(self.rec(par) for par in expr.parameters) + + for mangler in self.kernel.function_manglers: + mangle_result = mangler(identifier, arg_dtypes) + if mangle_result is not None: + result_dtype, _ = mangle_result + return result_dtype + + raise RuntimeError("no type inference information on " + "function '%s'" % identifier) def map_variable(self, expr): try: @@ -96,34 +98,45 @@ class TypeInferenceMapper(CombineMapper): return dtype def map_reduction(self, expr): - return expr.operation.dtype(expr.inames) + return expr.operation.result_dtype(self.rec(expr.expr), expr.inames) # }}} # {{{ C code mapper class LoopyCCodeMapper(CCodeMapper): - def __init__(self, kernel, cse_name_list=[], var_subst_map={}, + def __init__(self, kernel, seen_dtypes, seen_functions, var_subst_map={}, with_annotation=False, allow_complex=False): + """ + :arg seen_dtypes: set of dtypes that were encountered + :arg seen_functions: set of tuples (name, c_name, arg_types) indicating + functions that were encountered. + """ - CCodeMapper.__init__(self, cse_name_list=cse_name_list) + CCodeMapper.__init__(self) self.kernel = kernel - self.infer_type = TypeInferenceMapper(kernel) + self.seen_dtypes = seen_dtypes + self.seen_functions = seen_functions + + self.type_inf_mapper = TypeInferenceMapper(kernel) self.allow_complex = allow_complex self.with_annotation = with_annotation self.var_subst_map = var_subst_map.copy() - def copy(self, var_subst_map=None, cse_name_list=None): + def copy(self, var_subst_map=None): if var_subst_map is None: var_subst_map = self.var_subst_map - if cse_name_list is None: - cse_name_list = self.cse_name_list - return LoopyCCodeMapper(self.kernel, - cse_name_list=cse_name_list, var_subst_map=var_subst_map, + return LoopyCCodeMapper(self.kernel, self.seen_dtypes, self.seen_functions, + var_subst_map=var_subst_map, with_annotation=self.with_annotation, allow_complex=self.allow_complex) + def infer_type(self, expr): + result = self.type_inf_mapper(expr) + self.seen_dtypes.add(result) + return result + def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" var_subst_map = self.var_subst_map.copy() @@ -269,6 +282,33 @@ class LoopyCCodeMapper(CCodeMapper): else: return CCodeMapper.map_constant(self, expr, enclosing_prec) + def map_call(self, expr, enclosing_prec): + from pymbolic.primitives import Variable + from pymbolic.mapper.stringifier import PREC_NONE + + identifier = expr.function + + c_name = None + if isinstance(identifier, Variable): + identifier = identifier.name + c_name = identifier + + arg_dtypes = tuple(self.infer_type(par) for par in expr.parameters) + + for mangler in self.kernel.function_manglers: + mangle_result = mangler(identifier, arg_dtypes) + if mangle_result is not None: + result_dtype, c_name = mangle_result + + self.seen_functions.add((identifier, c_name, arg_dtypes)) + + if c_name is None: + raise RuntimeError("unable to find C name for function identifier '%s'" + % identifier) + + return self.format("%s(%s)", + c_name, self.join_rec(", ", expr.parameters, PREC_NONE)) + # {{{ deal with complex-valued variables def complex_type_name(self, dtype): diff --git a/loopy/kernel.py b/loopy/kernel.py index 4bd9750b33e0448ee80f420d9d550ae90144153d..ae40d8dbe4b3c565bfbc0e4c001a4651267dc446 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -6,7 +6,8 @@ import numpy as np from pytools import Record, memoize_method import islpy as isl from islpy import dim_type -from pymbolic import var + +import re @@ -390,241 +391,96 @@ class Instruction(Record): # }}} -# {{{ reduction operations - -class ReductionOperation(object): - def dtype(self, inames): - raise NotImplementedError - - def get_function_result_dtype_getter(self): - """If the reduction declares any functions, return a getter that - makes their return types known to type inference. - """ - return None - - def get_preambles(self, inames): - return [] - - def neutral_element(self, inames): - raise NotImplementedError - - def __call__(self, operand1, operand2): - raise NotImplementedError - -class ScalarReductionOperation(ReductionOperation): - def __init__(self, dtype): - self.scalar_dtype = dtype - - def dtype(self, inames): - return self.scalar_dtype - - def __str__(self): - return (type(self).__name__.replace("ReductionOperation", "").lower() - + "_" + str(self.dtype)) - -class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, inames): - return 0 - - def __call__(self, operand1, operand2, index): - return operand1 + operand2 - -class ProductReductionOperation(ScalarReductionOperation): - def neutral_element(self, inames): - return 1 - - def __call__(self, operand1, operand2, index): - return operand1 * operand2 - -def get_le_neutral(dtype): - """Return a number y that satisfies (x <= y) for all y.""" - - if dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return var("INFINITY") - else: - raise NotImplementedError("less") - -class MaxReductionOperation(ScalarReductionOperation): - def neutral_element(self, inames): - return get_le_neutral(self.dtype) - - def __call__(self, operand1, operand2, index): - return var("max")(operand1, operand2) - -class MinReductionOperation(ScalarReductionOperation): - @property - def neutral_element(self, inames): - return -get_le_neutral(self.dtype) - - def __call__(self, operand1, operand2, index): - return var("min")(operand1, operand2) - - - - -class _ArgExtremumReductionOperation(ReductionOperation): - def __init__(self, dtype): - self.scalar_dtype = dtype - - self.struct_dtype = np.dtype( - [("value", self.scalar_dtype), - ("index", np.int32)]) +# {{{ expand defines - self.prefix = "loopy_arg%s_%s" % (self.which, self.scalar_dtype.type.__name__) - self.type_name = self.prefix + "_result" - from pyopencl.tools import register_dtype - register_dtype(self.struct_dtype, self.type_name, alias_ok=True) +MACRO_RE = re.compile(r"\{([a-zA-Z0-9_]+)\}") - def dtype(self, inames): - return self.struct_dtype +def expand_defines(insn, defines): + macros = set(match.group(1) for match in MACRO_RE.finditer(insn)) - def get_function_result_dtype_getter(self): - names = [self.prefix+"_init", self.prefix+"_update"] - - def getter(name, arg_dtypes): - if name in names: - return self.struct_dtype - - return None - - return getter - - def get_preambles(self, inames): - """Returns a tuple (preamble_key, preamble), where *preamble* is a string - that goes into the kernel preamble, and *preamble_key* is a unique - identifier. No two preambles with the same key will be emitted. - """ - - if len(inames) != 1: - raise RuntimeError("arg%s must be used with exactly one iname" - % self.which) - - from pyopencl.tools import dtype_to_ctype - from pymbolic.mapper.c_code import CCodeMapper - - c_code_mapper = CCodeMapper() - - return [(self.prefix, """ - typedef struct { - %(scalar_type)s value; - int index; - } %(type_name)s; - - inline %(type_name)s %(prefix)s_init() - { - %(type_name)s result; - result.value = %(neutral)s; - result.index = INT_MIN; - return result; - } - - inline %(type_name)s %(prefix)s_update( - %(type_name)s state, %(scalar_type)s op2, int index) - { - %(type_name)s result; - if (op2 %(comp)s state.value) - { - result.value = op2; - result.index = index; - return result; - } - else return state; - } - """ % dict( - type_name=self.type_name, - scalar_type=dtype_to_ctype(self.scalar_dtype), - prefix=self.prefix, - neutral=c_code_mapper( - self.neutral_sign*get_le_neutral(self.scalar_dtype)), - comp=self.update_comparison, - ))] - - def neutral_element(self, inames): - return var(self.prefix+"_init")() - - def __call__(self, operand1, operand2, inames): - iname, = inames - - return var(self.prefix+"_update")( - operand1, operand2, var(iname)) + replacements = [()] + for mac in macros: + value = defines[mac] + if isinstance(value, list): + replacements = [ + rep+(("{%s}" % mac, subval),) + for rep in replacements + for subval in value + ] + else: + replacements = [ + rep+(("{%s}" % mac, value),) + for rep in replacements] -class ArgMaxReductionOperation(_ArgExtremumReductionOperation): - which = "max" - update_comparison = ">=" - neutral_sign = -1 + for rep in replacements: + rep_value = insn + for name, val in rep: + rep_value = rep_value.replace(name, str(val)) -class ArgMinReductionOperation(_ArgExtremumReductionOperation): - which = "min" - update_comparison = "<=" - neutral_sign = +1 + yield rep_value +# }}} +# {{{ function manglers +def default_function_mangler(name, arg_dtypes): + from loopy.reduction import reduction_function_mangler + manglers = [reduction_function_mangler] + for mangler in manglers: + result = mangler(name, arg_dtypes) + if result is not None: + return result + return None -_REDUCTION_OPS = { - "sum": SumReductionOperation, - "product": ProductReductionOperation, - "max": MaxReductionOperation, - "min": MinReductionOperation, - "argmax": ArgMaxReductionOperation, - "argmin": ArgMinReductionOperation, - } +def single_arg_function_mangler(name, arg_dtypes): + if len(arg_dtypes) == 1: + dtype, = arg_dtypes + return dtype, name -_REDUCTION_OP_PARSERS = [ - ] + return None +# }}} -def register_reduction_parser(parser): - """Register a new :class:`ReductionOperation`. +# {{{ preamble generators - :arg parser: A function that receives a string and returns - a subclass of ReductionOperation. - """ - _REDUCTION_OP_PARSERS.append(parser) +def default_preamble_generator(seen_dtypes, seen_functions): + from loopy.reduction import reduction_preamble_generator -def parse_reduction_op(name): - import re - red_op_match = re.match(r"^([a-z]+)_([a-z0-9_]+)$", name) - if red_op_match: - op_name = red_op_match.group(1) - op_type = red_op_match.group(2) + for result in reduction_preamble_generator(seen_dtypes, seen_functions): + yield result - try: - op_dtype = np.dtype(op_type) - except TypeError: - op_dtype = None + has_double = False + has_complex = False - if op_dtype is None and op_type.startswith("vec_"): - import pyopencl.array as cl_array - try: - op_dtype = getattr(cl_array.vec, op_type[4:]) - except AttributeError: - op_dtype = None + for dtype in seen_dtypes: + if dtype in [np.float64, np.complex128]: + has_double = True + if dtype.kind == "c": + has_complex = True - if op_name in _REDUCTION_OPS and op_dtype is not None: - return _REDUCTION_OPS[op_name](op_dtype) + if has_double: + yield ("00_enable_double", """ + #pragma OPENCL EXTENSION cl_khr_fp64: enable + """) - for parser in _REDUCTION_OP_PARSERS: - result = parser(name) - if result is not None: - return result + if has_complex: + if has_double: + yield ("10_include_complex_header", """ + #define PYOPENCL_DEFINE_CDOUBLE - return None + #include <pyopencl-complex.h> + """) + else: + yield ("10_include_complex_header", """ + #include <pyopencl-complex.h> + """) # }}} # {{{ loop kernel object -def _default_get_function_result_dtype(name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes - return dtype - - return None - class LoopKernel(Record): """ :ivar device: :class:`pyopencl.Device` @@ -635,27 +491,39 @@ class LoopKernel(Record): :ivar name: :ivar preambles: a list of (tag, code) tuples that identify preamble snippets. Each tag's snippet is only included once, at its first occurrence. + The preambles will be inserted in order of their tags. + :ivar preamble_generators: a list of functions of signature + (seen_dtypes, seen_functions) where seen_functions is a set of + (name, c_name, arg_dtypes), generating extra entries for `preambles`. :ivar assumptions: the initial implemented_domain, captures assumptions on the parameters. (an isl.Set) - :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr, - upper_incr) tuples that will be separated out in the execution to generate - 'bulk' slabs with fewer conditionals. - :ivar temporary_variables: - :ivar iname_to_tag: :ivar local_sizes: A dictionary from integers to integers, mapping - workgroup axes to ther sizes, e.g. *{0: 16}* forces axis 0 to be + workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be length 16. + :ivar temporary_variables: + :ivar iname_to_tag: :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule` objects - :ivar lowest_priority_inames: - :ivar breakable_inames: these inames' loops may be broken up by the scheduler - :ivar applied_substitutions: A list of past substitution dictionaries that + :ivar function_manglers: list of functions of signature (name, arg_dtypes) + returning a tuple (result_dtype, function_name), where the function_name + is the C-level function to be called. + :ivar defines: a dictionary of replacements to be made in instructions given + as strings before parsing. A macro instance intended to be replaced should + look like "{MACRO}" in the instruction code. The expansion given in this + parameter is allowed to be a list. In this case, instructions are generated + for *each* combination of macro values. + + The following arguments are not user-facing: + + :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr, + upper_incr) tuples that will be separated out in the execution to generate + 'bulk' slabs with fewer conditionals. + :ivar applied_iname_rewrites: A list of past substitution dictionaries that were applied to the kernel. These are stored so that they may be repeated on expressions the user specifies later. - :ivar function_result_dtype_getters: list of functions of signature (name, arg_dtypes) - returning the result dtype of that function. - :ivar cache_manager: + :ivar lowest_priority_inames: + :ivar breakable_inames: these inames' loops may be broken up by the scheduler The following instance variables are only used until :func:`loopy.make_kernel` is finished: @@ -665,14 +533,22 @@ class LoopKernel(Record): def __init__(self, device, domain, instructions, args=None, schedule=None, name="loopy_kernel", - preambles=[], assumptions=None, - iname_slab_increments={}, - temporary_variables={}, + preambles=[], + preamble_generators=[default_preamble_generator], + assumptions=None, local_sizes={}, - iname_to_tag={}, iname_to_tag_requests=None, substitutions={}, - cache_manager=None, lowest_priority_inames=[], breakable_inames=set(), - applied_substitutions=[], - function_result_dtype_getters=[_default_get_function_result_dtype]): + temporary_variables={}, + iname_to_tag={}, + substitutions={}, + function_manglers=[default_function_mangler, single_arg_function_mangler], + defines={}, + + # non-user-facing + iname_slab_increments={}, + applied_iname_rewrites=[], + cache_manager=None, + iname_to_tag_requests=None, + lowest_priority_inames=[], breakable_inames=set()): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" @@ -734,20 +610,7 @@ class LoopKernel(Record): # {{{ instruction parser - def parse_if_necessary(insn): - from loopy.symbolic import parse - - if isinstance(insn, Instruction): - if insn.id is None: - insn = insn.copy(id=self.make_unique_instruction_id(insns)) - insns.append(insn) - return - - if not isinstance(insn, str): - raise TypeError("Instructions must be either an Instruction " - "instance or a parseable string. got '%s' instead." - % type(insn)) - + def parse_insn(insn): insn_match = INSN_RE.match(insn) subst_match = SUBST_RE.match(insn) if insn_match is not None and subst_match is not None: @@ -760,6 +623,7 @@ class LoopKernel(Record): else: raise RuntimeError("insn parse error") + from loopy.symbolic import parse lhs = parse(groups["lhs"]) rhs = parse(groups["rhs"]) @@ -835,6 +699,22 @@ class LoopKernel(Record): arguments=arg_names, expression=rhs) + def parse_if_necessary(insn): + if isinstance(insn, Instruction): + if insn.id is None: + insn = insn.copy(id=self.make_unique_instruction_id(insns)) + insns.append(insn) + return + + if not isinstance(insn, str): + raise TypeError("Instructions must be either an Instruction " + "instance or a parseable string. got '%s' instead." + % type(insn)) + + for insn in expand_defines(insn, defines): + parse_insn(insn) + + # }}} insns = [] @@ -866,6 +746,7 @@ class LoopKernel(Record): schedule=schedule, name=name, preambles=preambles, + preamble_generators=preamble_generators, assumptions=assumptions, iname_slab_increments=iname_slab_increments, temporary_variables=temporary_variables, @@ -876,8 +757,8 @@ class LoopKernel(Record): cache_manager=cache_manager, lowest_priority_inames=lowest_priority_inames, breakable_inames=breakable_inames, - applied_substitutions=applied_substitutions, - function_result_dtype_getters=function_result_dtype_getters) + applied_iname_rewrites=applied_iname_rewrites, + function_manglers=function_manglers) def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): if insns is None: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 82a67450efe2ca558d5e7c103dd4fe3139553e9a..c3755f1949c3ac60fe42094ffd61d8e056d8b8d0 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -6,85 +6,103 @@ import pyopencl.characterize as cl_char -# {{{ gather dtype getters from reduction operations +# {{{ infer types -def gather_dtype_getters(kernel): - dtype_getters = kernel.function_result_dtype_getters +def infer_temp_var_type(kernel, tv, type_inf_mapper, debug): + dtypes = [] - def gather_from_reduction(expr, rec): - red_getter = expr.operation.get_function_result_dtype_getter() - if red_getter is not None: - dtype_getters.append(red_getter) + writers = kernel.writer_map()[tv.name] + exprs = [kernel.id_to_insn[w].expression for w in writers] - rec(expr.expr) + from loopy.codegen.expression import DependencyTypeInferenceFailure + for expr in exprs: + try: + if debug: + print " via expr", expr + result = type_inf_mapper(expr) - from loopy.symbolic import ReductionCallbackMapper - rcm = ReductionCallbackMapper(gather_from_reduction) - for insn in kernel.instructions: - rcm(insn.expression) + if debug: + print " result", result - return kernel.copy(function_result_dtype_getters=dtype_getters) + dtypes.append(result) -# }}} + except DependencyTypeInferenceFailure: + if debug: + print " failed" + + if not dtypes: + return None -# {{{ infer types of temporaries + from pytools import is_single_valued + if not is_single_valued(dtypes): + raise RuntimeError("ambiguous type inference for '%s'" + % tv.name) -def infer_types_of_temporaries(kernel): - new_temp_vars = {} + return dtypes[0] + +def infer_types(kernel): + """Infer types on temporaries and reductions.""" + + new_temp_vars = kernel.temporary_variables.copy() + # {{{ fill queue + + # queue contains temporary variables and reductions queue = [] from loopy import infer_type for tv in kernel.temporary_variables.itervalues(): if tv.dtype is infer_type: queue.append(tv) - new_temp_vars[tv.name] = tv - else: - new_temp_vars[tv.name] = tv - from loopy.codegen.expression import ( - TypeInferenceMapper, DependencyTypeInferenceFailure) - tim = TypeInferenceMapper(kernel, new_temp_vars) + # }}} + + from loopy.codegen.expression import TypeInferenceMapper + type_inf_mapper = TypeInferenceMapper(kernel, new_temp_vars) + + # {{{ work on type inference queue + + from loopy.kernel import TemporaryVariable + + debug = 0 first_failure = None while queue: - tv = queue.pop(0) + item = queue.pop(0) - dtypes = [] + if isinstance(item, TemporaryVariable): + if debug: + print "inferring type for tempvar", item.name - writers = kernel.writer_map()[tv.name] - exprs = [kernel.id_to_insn[w].expression for w in writers] + result = infer_temp_var_type(kernel, item, type_inf_mapper, debug) - for expr in exprs: - try: - dtypes.append(tim(expr)) - except DependencyTypeInferenceFailure: - pass + failed = result is None + if not failed: + if debug: + print " success", result + new_temp_vars[item.name] = item.copy(dtype=result) + else: + if debug: + print " failure", result + else: + raise RuntimeError("unexpected item type in type inference") - if not dtypes: - if tv is first_failure: - # this has failed before, give up. - raise RuntimeError("could not determine type of '%s' from expression(s) '%s'" - % (tv.name, - ", ".join(str(e) for e in exprs))) + if failed: + if item is first_failure: + # this item has failed before, give up. + raise RuntimeError("could not determine type of '%s'" % item) if first_failure is None: # remember the first failure for this round through the queue - first_failure = tv + first_failure = item # can't infer type yet, put back into queue - queue.append(tv) - continue - - from pytools import is_single_valued - if not is_single_valued(dtypes): - raise RuntimeError("ambiguous type inference for '%s'" - % tv.name) - - new_temp_vars[tv.name] = tv.copy(dtype=dtypes[0]) + queue.append(item) + else: + # we've made progress, reset failure marker + first_failure = None - # we've made progress, reset failure flag. - first_failure = None + # }}} return kernel.copy(temporary_variables=new_temp_vars) @@ -209,10 +227,12 @@ def realize_reduction(kernel, insn_id_filter=None): new_insns = [] new_temporary_variables = kernel.temporary_variables.copy() - new_preambles = kernel.preambles from loopy.kernel import IlpBaseTag + from loopy.codegen.expression import TypeInferenceMapper + type_inf_mapper = TypeInferenceMapper(kernel) + def map_reduction(expr, rec): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. @@ -239,8 +259,6 @@ def realize_reduction(kernel, insn_id_filter=None): # }}} - new_preambles.extend(expr.operation.get_preambles(expr.inames)) - from pymbolic import var target_var_name = kernel.make_unique_var_name("acc_"+"_".join(expr.inames), @@ -251,12 +269,14 @@ def realize_reduction(kernel, insn_id_filter=None): target_var = target_var[ tuple(var(ilp_iname) for ilp_iname in ilp_inames)] + arg_dtype = type_inf_mapper(expr.expr) + from loopy.kernel import Instruction from loopy.kernel import TemporaryVariable new_temporary_variables[target_var_name] = TemporaryVariable( name=target_var_name, - dtype=expr.operation.dtype(expr.inames), + dtype=expr.operation.result_dtype(arg_dtype, expr.inames), shape=tuple(ilp_iname_lengths), is_local=False) @@ -268,7 +288,7 @@ def realize_reduction(kernel, insn_id_filter=None): id=new_id, assignee=target_var, forced_iname_deps=temp_kernel.insn_inames(insn) - set(expr.inames), - expression=expr.operation.neutral_element(expr.inames)) + expression=expr.operation.neutral_element(arg_dtype, expr.inames)) generated_insns.append(init_insn) @@ -279,7 +299,7 @@ def realize_reduction(kernel, insn_id_filter=None): reduction_insn = Instruction( id=new_id, assignee=target_var, - expression=expr.operation(target_var, expr.expr, expr.inames), + expression=expr.operation(arg_dtype, target_var, expr.expr, expr.inames), insn_deps=set([init_insn.id]) | insn.insn_deps, forced_iname_deps=temp_kernel.insn_inames(insn) | set(expr.inames)) @@ -783,18 +803,19 @@ def adjust_local_temp_var_storage(kernel): def preprocess_kernel(kernel): - kernel = gather_dtype_getters(kernel) - # all type inference must happen *after* this point (because only then all - # the functions return dtype getters are available.) - kernel = infer_types_of_temporaries(kernel) + kernel = infer_types(kernel) - from loopy.subst import expand_subst - kernel = expand_subst(kernel) + # Ordering restriction: + # realize_reduction must happen after type inference because it needs + # to be able to determine the types of the reduced expressions. kernel = realize_reduction(kernel) + from loopy.subst import expand_subst + kernel = expand_subst(kernel) + # Ordering restriction: # Must realize reductions before realizing ILP, because realize_ilp() # gets rid of ILP tags, but realize_reduction() needs them to do diff --git a/loopy/reduction.py b/loopy/reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..8b687cfc811b01033db07aa1f73215a10fd2eb63 --- /dev/null +++ b/loopy/reduction.py @@ -0,0 +1,249 @@ +from pymbolic import var +import numpy as np + +from loopy.symbolic import FunctionIdentifier + + + + +class ReductionOperation(object): + def result_dtype(self, arg_dtype, inames): + raise NotImplementedError + + def neutral_element(self, dtype, inames): + raise NotImplementedError + + def __call__(self, dtype, operand1, operand2, inames): + raise NotImplementedError + +class ScalarReductionOperation(ReductionOperation): + def __init__(self, forced_result_dtype=None): + """ + :arg forced_result_dtype: Force the reduction result to be of this type. + """ + self.forced_result_dtype = forced_result_dtype + + def result_dtype(self, arg_dtype, inames): + if self.forced_result_dtype is not None: + return self.forced_result_dtype + + return arg_dtype + + def __str__(self): + result = type(self).__name__.replace("ReductionOperation", "").lower() + + if self.forced_result_dtype is not None: + result = "%s<%s>" % (result, str(self.dtype)) + + return result + +class SumReductionOperation(ScalarReductionOperation): + def neutral_element(self, dtype, inames): + return 0 + + def __call__(self, dtype, operand1, operand2, inames): + return operand1 + operand2 + +class ProductReductionOperation(ScalarReductionOperation): + def neutral_element(self, dtype, inames): + return 1 + + def __call__(self, dtype, operand1, operand2, inames): + return operand1 * operand2 + +def get_le_neutral(dtype): + """Return a number y that satisfies (x <= y) for all y.""" + + if dtype.kind == "f": + # OpenCL 1.1, section 6.11.2 + return var("INFINITY") + else: + raise NotImplementedError("less") + +class MaxReductionOperation(ScalarReductionOperation): + def neutral_element(self, dtype, inames): + return get_le_neutral(dtype) + + def __call__(self, dtype, operand1, operand2, inames): + return var("max")(operand1, operand2) + +class MinReductionOperation(ScalarReductionOperation): + @property + def neutral_element(self, dtype, inames): + return -get_le_neutral(dtype) + + def __call__(self, dtype, operand1, operand2, inames): + return var("min")(operand1, operand2) + + + + +# {{{ argmin/argmax + +ARGEXT_STRUCT_DTYPES = {} + +class _ArgExtremumReductionOperation(ReductionOperation): + def prefix(self, dtype): + return "loopy_arg%s_%s" % (self.which, dtype.type.__name__) + + def result_dtype(self, dtype, inames): + try: + return ARGEXT_STRUCT_DTYPES[dtype] + except KeyError: + struct_dtype = np.dtype([("value", dtype), ("index", np.int32)]) + ARGEXT_STRUCT_DTYPES[dtype] = struct_dtype + + from pyopencl.tools import register_dtype + register_dtype(struct_dtype, self.prefix(dtype)+"_result", alias_ok=True) + return struct_dtype + + def neutral_element(self, dtype, inames): + return ArgExtFunction(self, dtype, "init", inames)() + + def __call__(self, dtype, operand1, operand2, inames): + iname, = inames + + return ArgExtFunction(self, dtype, "update", inames)( + operand1, operand2, var(iname)) + +class ArgMaxReductionOperation(_ArgExtremumReductionOperation): + which = "max" + update_comparison = ">=" + neutral_sign = -1 + +class ArgMinReductionOperation(_ArgExtremumReductionOperation): + which = "min" + update_comparison = "<=" + neutral_sign = +1 + +class ArgExtFunction(FunctionIdentifier): + def __init__(self, reduction_op, scalar_dtype, name, inames): + self.reduction_op = reduction_op + self.scalar_dtype = scalar_dtype + self.name = name + self.inames = inames + +def get_argext_preamble(func_id): + op = func_id.reduction_op + prefix = op.prefix(func_id.scalar_dtype) + + from pyopencl.tools import dtype_to_ctype + from pymbolic.mapper.c_code import CCodeMapper + + c_code_mapper = CCodeMapper() + + return (prefix, """ + typedef struct { + %(scalar_type)s value; + int index; + } %(type_name)s; + + inline %(type_name)s %(prefix)s_init() + { + %(type_name)s result; + result.value = %(neutral)s; + result.index = INT_MIN; + return result; + } + + inline %(type_name)s %(prefix)s_update( + %(type_name)s state, %(scalar_type)s op2, int index) + { + %(type_name)s result; + if (op2 %(comp)s state.value) + { + result.value = op2; + result.index = index; + return result; + } + else return state; + } + """ % dict( + type_name=prefix+"_result", + scalar_type=dtype_to_ctype(func_id.scalar_dtype), + prefix=prefix, + neutral=c_code_mapper( + op.neutral_sign*get_le_neutral(func_id.scalar_dtype)), + comp=op.update_comparison, + )) + +# }} + + + + +# {{{ reduction op registry + +_REDUCTION_OPS = { + "sum": SumReductionOperation, + "product": ProductReductionOperation, + "max": MaxReductionOperation, + "min": MinReductionOperation, + "argmax": ArgMaxReductionOperation, + "argmin": ArgMinReductionOperation, + } + +_REDUCTION_OP_PARSERS = [ + ] + + +def register_reduction_parser(parser): + """Register a new :class:`ReductionOperation`. + + :arg parser: A function that receives a string and returns + a subclass of ReductionOperation. + """ + _REDUCTION_OP_PARSERS.append(parser) + +def parse_reduction_op(name): + import re + + red_op_match = re.match(r"^([a-z]+)_([a-z0-9_]+)$", name) + if red_op_match: + op_name = red_op_match.group(1) + op_type = red_op_match.group(2) + + try: + op_dtype = np.dtype(op_type) + except TypeError: + op_dtype = None + + if op_dtype is None and op_type.startswith("vec_"): + import pyopencl.array as cl_array + try: + op_dtype = getattr(cl_array.vec, op_type[4:]) + except AttributeError: + op_dtype = None + + if op_name in _REDUCTION_OPS and op_dtype is not None: + return _REDUCTION_OPS[op_name](op_dtype) + + if name in _REDUCTION_OPS: + return _REDUCTION_OPS[name]() + + for parser in _REDUCTION_OP_PARSERS: + result = parser(name) + if result is not None: + return result + + return None + +# }}} + + + + +def reduction_function_mangler(func_id, arg_dtypes): + if isinstance(func_id, ArgExtFunction): + op = func_id.reduction_op + return (op.result_dtype(func_id.scalar_dtype, func_id.inames), + "%s_%s" % (op.prefix(func_id.scalar_dtype), func_id.name)) + + return None + +def reduction_preamble_generator(seen_dtypes, seen_functions): + for func_id, c_name, arg_dtypes in seen_functions: + if isinstance(func_id, ArgExtFunction): + yield get_argext_preamble(func_id) + +# vim: fdm=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index d2029c4628a697ab899cec314e2679acf4511f91..d91b94c001863ff0ffa21b58979ef52257e6cb41 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -5,7 +5,7 @@ from __future__ import division from pytools import memoize, memoize_method from pymbolic.primitives import ( - AlgebraicLeaf, Variable as VariableBase, + Leaf, AlgebraicLeaf, Variable as VariableBase, CommonSubexpression) from pymbolic.mapper import ( @@ -32,6 +32,12 @@ from islpy import dim_type # {{{ loopy-specific primitives +class FunctionIdentifier(Leaf): + def __getinitargs__(self): + return () + + mapper_method = intern("map_loopy_function_identifier") + class TypedCSE(CommonSubexpression): def __init__(self, child, prefix=None, dtype=None): CommonSubexpression.__init__(self, child, prefix) @@ -45,6 +51,11 @@ class TypedCSE(CommonSubexpression): class TaggedVariable(VariableBase): + """This is an identifier with a tag, such as 'matrix$one', where + 'one' identifies this specific use of the identifier. This mechanism + may then be used to address these uses--such as by prefetching only + accesses tagged a certain way. + """ def __init__(self, name, tag): VariableBase.__init__(self, name) self.tag = tag @@ -62,7 +73,7 @@ class Reduction(AlgebraicLeaf): assert isinstance(inames, tuple) if isinstance(operation, str): - from loopy.kernel import parse_reduction_op + from loopy.reduction import parse_reduction_op operation = parse_reduction_op(operation) self.operation = operation @@ -109,6 +120,9 @@ class IdentityMapperMixin(object): # leaf, doesn't change return expr + def map_loopy_function_identifier(self, expr): + return expr + class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): pass @@ -143,6 +157,9 @@ class DependencyMapper(DependencyMapperBase): def map_tagged_variable(self, expr): return set([expr]) + def map_loopy_function_identifier(self, expr): + return set() + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): if not isinstance(other, type(expr)): @@ -224,7 +241,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: # see if 'name' is an existing reduction op - from loopy.kernel import parse_reduction_op + from loopy.reduction import parse_reduction_op if parse_reduction_op(name): if len(expr.parameters) != 2: raise RuntimeError("invalid invocation of " diff --git a/test/test_loopy.py b/test/test_loopy.py index b799e88036a4e10e9af89a330777b11448f8737e..a40a157f48f85f5c10a7fee456df1be8335a2549 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -161,7 +161,7 @@ def test_argmax(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "{[i]: 0<=i<%d}" % n, [ - "<> result = argmax_float32(i, fabs(a[i]))", + "<> result = argmax(i, fabs(a[i]))", "max_idx = result.index", "max_val = result.value", ],