diff --git a/.gitmodules b/.gitmodules index 504e23cf344e2d5ae35f6f6abe97458b8c7a39b8..41cf31d9cfcf85de99569ea131c3f367e68a3436 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "loopy/target/opencl/compyte"] - path = loopy/target/opencl/compyte +[submodule "loopy/target/c/compyte"] + path = loopy/target/c/compyte url = https://github.com/inducer/compyte diff --git a/bin/loopy b/bin/loopy index ed8f0ff16e4e82b02b09da126660db1baaed4035..2a657174d4e390f80331689068b8d21e8eab0de7 100644 --- a/bin/loopy +++ b/bin/loopy @@ -116,7 +116,7 @@ def main(): kernels = [kernel] - elif args.lang in ["fortran", "floopy"]: + elif args.lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None if args.transform: with open(args.transform, "r") as xform_fd: @@ -132,7 +132,8 @@ def main(): + pre_transform_code) from loopy.frontend.fortran import f2loopy - kernels = f2loopy(infile_content, pre_transform_code=pre_transform_code) + kernels = f2loopy(infile_content, pre_transform_code=pre_transform_code, + use_c_preprocessor=(args.lang == "fpp")) if args.name is not None: kernels = [kernel for kernel in kernels diff --git a/examples/fortran/run-floopy.sh b/examples/fortran/run-floopy.sh index 6a718bb69ba9db8ec2a8f5cbf3b1314676d2f6ab..fcea2c8b5ed58eed8738ad263df62cdf687b3d0f 100755 --- a/examples/fortran/run-floopy.sh +++ b/examples/fortran/run-floopy.sh @@ -3,4 +3,4 @@ NAME="$1" shift -python $(which loopy) --lang=floopy "$NAME" - "$@" +python $(which loopy) --lang=fpp "$NAME" - "$@" diff --git a/loopy/__init__.py b/loopy/__init__.py index 3bc1d587c897d41b27c1f56e5a938881f58b0f03..12329bbf6472867c873958f1a18f20ba1016b5f7 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -31,7 +31,8 @@ import islpy as isl from islpy import dim_type from loopy.symbolic import (RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, - SubstitutionRuleMappingContext) + SubstitutionRuleMappingContext, + TaggedVariable, Reduction, LinearSubscript, ) from loopy.diagnostic import LoopyError @@ -62,12 +63,14 @@ from loopy.padding import (split_arg_axis, find_padding_multiple, from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.codegen import generate_code +from loopy.codegen import generate_code, generate_body from loopy.compiled import CompiledKernel from loopy.options import Options from loopy.auto_test import auto_test_vs_ref __all__ = [ + "TaggedVariable", "Reduction", "LinearSubscript", + "auto", "LoopKernel", @@ -94,7 +97,7 @@ __all__ = [ "preprocess_kernel", "realize_reduction", "infer_unknown_types", "generate_loop_schedules", "get_one_scheduled_kernel", - "generate_code", + "generate_code", "generate_body", "CompiledKernel", @@ -555,9 +558,10 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, # {{{ normalize arguments, find unique new_inames if isinstance(inames, str): - inames = inames.split(",") + inames = [iname.strip() for iname in inames.split(",")] + if isinstance(new_inames, str): - new_inames = new_inames.split(",") + new_inames = [iname.strip() for iname in new_inames.split(",")] from loopy.context_matching import parse_stack_match within = parse_stack_match(within) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index c14c1ffa24157a51c58b31a166913e94a558e062..26f225c6293e62ef43adc921bfef20a3a2a534b1 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -1,6 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six from loopy.diagnostic import LoopyError from pytools import Record @@ -146,6 +145,26 @@ def add_comment(cmt, code): # }}} +class SeenFunction(Record): + """ + .. attribute:: name + .. attribute:: c_name + .. attribute:: arg_dtypes + + a tuple of arg dtypes + """ + + def __init__(self, name, c_name, arg_dtypes): + Record.__init__(self, + name=name, + c_name=c_name, + arg_dtypes=arg_dtypes) + + def __hash__(self): + return hash((type(self),) + + tuple((f, getattr(self, f)) for f in type(self).fields)) + + # {{{ code generation state class CodeGenerationState(object): @@ -160,23 +179,26 @@ class CodeGenerationState(object): A :class:`frozenset` of predicates for which checks have been implemented. - .. attribute:: c_code_mapper + .. attribute:: expression_to_code_mapper A :class:`loopy.codegen.expression.CCodeMapper` that does not take per-ILP assignments into account. """ - def __init__(self, implemented_domain, implemented_predicates, c_code_mapper): + def __init__(self, implemented_domain, implemented_predicates, + expression_to_code_mapper): self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates - self.c_code_mapper = c_code_mapper + self.expression_to_code_mapper = expression_to_code_mapper def copy(self, implemented_domain=None, implemented_predicates=frozenset(), - c_code_mapper=None): + expression_to_code_mapper=None): return CodeGenerationState( implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( implemented_predicates or self.implemented_predicates), - c_code_mapper=c_code_mapper or self.c_code_mapper) + expression_to_code_mapper=( + expression_to_code_mapper + or self.expression_to_code_mapper)) def intersect(self, other): new_impl, new_other = isl.align_two(self.implemented_domain, other) @@ -205,7 +227,8 @@ class CodeGenerationState(object): new_impl_domain = new_impl_domain.add_constraint(cns) return self.copy( implemented_domain=new_impl_domain, - c_code_mapper=self.c_code_mapper.copy_and_assign(iname, expr)) + expression_to_code_mapper=( + self.expression_to_code_mapper.copy_and_assign(iname, expr))) # }}} @@ -367,34 +390,13 @@ def generate_code(kernel, device=None): from loopy.check import pre_codegen_checks pre_codegen_checks(kernel) - from cgen import (FunctionBody, FunctionDeclaration, - Value, Module, Block, - Line, Const, LiteralLines, Initializer) - logger.info("%s: generate code: start" % kernel.name) - from cgen.opencl import (CLKernel, CLRequiredWorkGroupSize) - - allow_complex = False - for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): - if var.dtype.kind == "c": - allow_complex = True - - seen_dtypes = set() - seen_functions = set() - - from loopy.codegen.expression import LoopyCCodeMapper - ccm = (LoopyCCodeMapper(kernel, seen_dtypes, seen_functions, - allow_complex=allow_complex)) - - mod = [] - - body = Block() - # {{{ examine arg list - from loopy.kernel.data import ImageArg, ValueArg + from loopy.kernel.data import ValueArg from loopy.kernel.array import ArrayBase + from cgen import Const impl_arg_info = [] @@ -417,55 +419,29 @@ def generate_code(kernel, device=None): else: raise ValueError("argument type not understood: '%s'" % type(arg)) - if any(isinstance(arg, ImageArg) for arg in kernel.args): - body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")), - "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " - "| CLK_FILTER_NEAREST")) + allow_complex = False + for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): + if var.dtype.kind == "c": + allow_complex = True # }}} - mod.extend([ - LiteralLines(r""" - #define lid(N) ((%(idx_ctype)s) get_local_id(N)) - #define gid(N) ((%(idx_ctype)s) get_group_id(N)) - """ % dict(idx_ctype=kernel.target.dtype_to_typename(kernel.index_dtype))), - Line()]) - - # {{{ build lmem array declarators for temporary variables - - body.extend( - idi.cgen_declarator - for tv in six.itervalues(kernel.temporary_variables) - for idi in tv.decl_info( - kernel.target, - is_written=True, index_dtype=kernel.index_dtype)) - - # }}} + seen_dtypes = set() + seen_functions = set() initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), - c_code_mapper=ccm) - - from loopy.codegen.loop import set_up_hw_parallel_loops - gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state) + expression_to_code_mapper=kernel.target.get_expression_to_code_mapper( + kernel, seen_dtypes, seen_functions, allow_complex)) - body.append(Line()) + code_str, implemented_domains = kernel.target.generate_code( + kernel, codegen_state, impl_arg_info) - if isinstance(gen_code.ast, Block): - body.extend(gen_code.ast.contents) - else: - body.append(gen_code.ast) - - mod.append( - FunctionBody( - CLRequiredWorkGroupSize( - kernel.get_grid_sizes_as_exprs()[1], - CLKernel(FunctionDeclaration( - Value("void", kernel.name), - [iai.cgen_declarator for iai in impl_arg_info]))), - body)) + from loopy.check import check_implemented_domains + assert check_implemented_domains(kernel, implemented_domains, + code_str) # {{{ handle preambles @@ -491,20 +467,18 @@ def generate_code(kernel, device=None): seen_preamble_tags.add(tag) dedup_preambles.append(preamble) - mod = ([LiteralLines(lines) for lines in dedup_preambles] - + [Line()] + mod) - - # }}} + from loopy.tools import remove_common_indentation + preamble_codes = [ + remove_common_indentation(lines) + "\n" + for lines in dedup_preambles] - result = str(Module(mod)) + code_str = "".join(preamble_codes) + code_str - from loopy.check import check_implemented_domains - assert check_implemented_domains(kernel, gen_code.implemented_domains, - result) + # }}} logger.info("%s: generate code: done" % kernel.name) - result = result, impl_arg_info + result = code_str, impl_arg_info if CACHING_ENABLED: code_gen_cache[input_kernel] = result @@ -514,4 +488,51 @@ def generate_code(kernel, device=None): # }}} +# {{{ generate function body + +def generate_body(kernel): + if kernel.schedule is None: + from loopy.schedule import get_one_scheduled_kernel + kernel = get_one_scheduled_kernel(kernel) + from loopy.kernel import kernel_state + if kernel.state != kernel_state.SCHEDULED: + raise LoopyError("cannot generate code for a kernel that has not been " + "scheduled") + + from loopy.preprocess import infer_unknown_types + kernel = infer_unknown_types(kernel, expect_completion=True) + + from loopy.check import pre_codegen_checks + pre_codegen_checks(kernel) + + logger.info("%s: generate code: start" % kernel.name) + + allow_complex = False + for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): + if var.dtype.kind == "c": + allow_complex = True + + seen_dtypes = set() + seen_functions = set() + + initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) + codegen_state = CodeGenerationState( + implemented_domain=initial_implemented_domain, + implemented_predicates=frozenset(), + expression_to_code_mapper=kernel.target.get_expression_to_code_mapper( + kernel, seen_dtypes, seen_functions, allow_complex)) + + code_str, implemented_domains = kernel.target.generate_body( + kernel, codegen_state) + + from loopy.check import check_implemented_domains + assert check_implemented_domains(kernel, implemented_domains, + code_str) + + logger.info("%s: generate code: done" % kernel.name) + + return code_str + +# }}} + # vim: foldmethod=marker diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index c9e79f6bcc17f3ed3166dfc18cc012721410af6c..19ac4106ba58821e5d4bf5231eb530977739c7f3 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -28,14 +28,14 @@ from islpy import dim_type from pymbolic.mapper.stringifier import PREC_NONE -def constraint_to_code(ccm, cns): +def constraint_to_code(ecm, cns): if cns.is_equality(): comp_op = "==" else: comp_op = ">=" from loopy.symbolic import constraint_to_expr - return "%s %s 0" % (ccm(constraint_to_expr(cns), PREC_NONE, "i"), comp_op) + return "%s %s 0" % (ecm(constraint_to_expr(cns), PREC_NONE, "i"), comp_op) # {{{ bounds check generator diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 3927293ae82dc521327a8c17afae5e93be1e3aa9..b28244a8c01e72611244f33901793c37804b00ad 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -77,7 +77,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): elif isinstance(sched_item, Barrier): from loopy.codegen import GeneratedInstruction - from cgen import Statement as S + from cgen import Statement as S # noqa if sched_item.comment: comment = " /* %s */" % sched_item.comment @@ -362,7 +362,8 @@ def build_loop_nest(kernel, sched_index, codegen_state): from loopy.codegen.bounds import constraint_to_code conditionals = [ - constraint_to_code(codegen_state.c_code_mapper, cns) + constraint_to_code( + codegen_state.expression_to_code_mapper, cns) for cns in bounds_checks] + list(pred_checks) result = [wrap_in_if(conditionals, gen_code_block(result))] diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 1bd977f3ea2e8e15e63424e89b1e347063da70e2..b98716a4e6281ff559abc670a8e1b0fe5d67976d 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -45,7 +45,9 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st if bounds_check_set.is_empty(): return None, None - condition_codelets = [constraint_to_code(codegen_state.c_code_mapper, cns) + condition_codelets = [ + constraint_to_code( + codegen_state.expression_to_code_mapper, cns) for cns in bounds_checks] condition_codelets.extend( @@ -86,7 +88,7 @@ def generate_instruction_code(kernel, insn, codegen_state): def generate_expr_instruction_code(kernel, insn, codegen_state): - ccm = codegen_state.c_code_mapper + ecm = codegen_state.expression_to_code_mapper expr = insn.expression @@ -94,16 +96,16 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): target_dtype = kernel.get_var_descriptor(assignee_var_name).dtype from cgen import Assign - from loopy.codegen.expression import dtype_to_type_context - lhs_code = ccm(insn.assignee, prec=PREC_NONE, type_context=None) + from loopy.expression import dtype_to_type_context + lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None) result = Assign( lhs_code, - ccm(expr, prec=PREC_NONE, + ecm(expr, prec=PREC_NONE, type_context=dtype_to_type_context(kernel.target, target_dtype), needed_dtype=target_dtype)) if kernel.options.trace_assignments or kernel.options.trace_assignment_values: - from cgen import Statement as S + from cgen import Statement as S # noqa gs, ls = kernel.get_grid_sizes() @@ -123,7 +125,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): if assignee_indices: printf_format += "[%s]" % ",".join(len(assignee_indices) * ["%d"]) printf_args.extend( - ccm(i, prec=PREC_NONE, type_context="i") + ecm(i, prec=PREC_NONE, type_context="i") for i in assignee_indices) if kernel.options.trace_assignment_values: @@ -158,7 +160,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): def generate_c_instruction_code(kernel, insn, codegen_state): - ccm = codegen_state.c_code_mapper + ecm = codegen_state.expression_to_code_mapper body = [] @@ -168,14 +170,14 @@ def generate_c_instruction_code(kernel, insn, codegen_state): from pymbolic.primitives import Variable for name, iname_expr in insn.iname_exprs: if (isinstance(iname_expr, Variable) - and name not in ccm.var_subst_map): + and name not in ecm.var_subst_map): # No need, the bare symbol will work continue body.append( Initializer( POD(kernel.target, kernel.index_dtype, name), - codegen_state.c_code_mapper( + codegen_state.expression_to_code_mapper( iname_expr, prec=PREC_NONE, type_context="i"))) if body: diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 40f433a90f90855aa554261b8d6fb5cbdd5fb0ec..dcea94690c2cca630b40a870f777d884715f1dd4 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -250,8 +250,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, # slabbing conditionals. slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname) new_codegen_state = codegen_state.copy( - c_code_mapper=codegen_state.c_code_mapper.copy_and_assign( - iname, hw_axis_expr)) + expression_to_code_mapper=( + codegen_state.expression_to_code_mapper.copy_and_assign( + iname, hw_axis_expr))) inner = set_up_hw_parallel_loops( slabbed_kernel, sched_index, @@ -268,7 +269,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, # {{{ sequential loop def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): - ccm = codegen_state.c_code_mapper + ecm = codegen_state.expression_to_code_mapper loop_iname = kernel.schedule[sched_index].iname slabs = get_slab_decomposition( @@ -319,11 +320,11 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): static_lbound = static_min_of_pw_aff( kernel.cache_manager.dim_min( dom_and_slab, loop_iname_idx).coalesce(), - constants_only=False) + constants_only=False, prefer_constants=False) static_ubound = static_max_of_pw_aff( kernel.cache_manager.dim_max( dom_and_slab, loop_iname_idx).coalesce(), - constants_only=False) + constants_only=False, prefer_constants=False) # }}} @@ -362,7 +363,7 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): # single-trip, generate just a variable assignment, not a loop result.append(gen_code_block([ Initializer(Const(POD(kernel.index_dtype, loop_iname)), - ccm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), Line(), inner, ])) @@ -373,9 +374,9 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): result.append(wrap_in(For, "%s %s = %s" % (kernel.target.dtype_to_typename(kernel.index_dtype), - loop_iname, ccm(aff_to_expr(static_lbound), PREC_NONE, "i")), + loop_iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), "%s <= %s" % ( - loop_iname, ccm(aff_to_expr(static_ubound), PREC_NONE, "i")), + loop_iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")), "++%s" % loop_iname, inner)) diff --git a/loopy/expression.py b/loopy/expression.py new file mode 100644 index 0000000000000000000000000000000000000000..2afb803b97b0e5b9f9ff1510da92ad10a50711dc --- /dev/null +++ b/loopy/expression.py @@ -0,0 +1,257 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012-15 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np + +from pymbolic.mapper import CombineMapper + +from loopy.tools import is_integer +from loopy.diagnostic import TypeInferenceFailure, DependencyTypeInferenceFailure + + +# type_context may be: +# - 'i' for integer - +# - 'f' for single-precision floating point +# - 'd' for double-precision floating point +# or None for 'no known context'. + +def dtype_to_type_context(target, dtype): + dtype = np.dtype(dtype) + + if dtype.kind == 'i': + return 'i' + if dtype in [np.float64, np.complex128]: + return 'd' + if dtype in [np.float32, np.complex64]: + return 'f' + if target.is_vector_dtype(dtype): + return dtype_to_type_context(target, dtype.fields["x"][0]) + + return None + + +# {{{ type inference + +class TypeInferenceMapper(CombineMapper): + def __init__(self, kernel, new_assignments=None): + """ + :arg new_assignments: mapping from names to either + :class:`loopy.kernel.data.TemporaryVariable` + or + :class:`loopy.kernel.data.KernelArgument` + instances + """ + self.kernel = kernel + if new_assignments is None: + new_assignments = {} + self.new_assignments = new_assignments + + # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) + # are Python-equal (for many common constants such as integers). + + @staticmethod + def combine(dtypes): + dtypes = list(dtypes) + + result = dtypes.pop() + while dtypes: + other = dtypes.pop() + + if result.isbuiltin and other.isbuiltin: + if (result, other) in [ + (np.int32, np.float32), (np.int32, np.float32)]: + # numpy makes this a double. I disagree. + result = np.dtype(np.float32) + else: + result = ( + np.empty(0, dtype=result) + + np.empty(0, dtype=other) + ).dtype + elif result.isbuiltin and not other.isbuiltin: + # assume the non-native type takes over + result = other + elif not result.isbuiltin and other.isbuiltin: + # assume the non-native type takes over + pass + else: + if result is not other: + raise TypeInferenceFailure( + "nothing known about result of operation on " + "'%s' and '%s'" % (result, other)) + + return result + + def map_sum(self, expr): + dtypes = [] + small_integer_dtypes = [] + for child in expr.children: + dtype = self.rec(child) + if is_integer(child) and abs(child) < 1024: + small_integer_dtypes.append(dtype) + else: + dtypes.append(dtype) + + from pytools import all + if all(dtype.kind == "i" for dtype in dtypes): + dtypes.extend(small_integer_dtypes) + + return self.combine(dtypes) + + map_product = map_sum + + def map_quotient(self, expr): + n_dtype = self.rec(expr.numerator) + d_dtype = self.rec(expr.denominator) + + if n_dtype.kind in "iu" and d_dtype.kind in "iu": + # both integers + return np.dtype(np.float64) + + else: + return self.combine([n_dtype, d_dtype]) + + def map_constant(self, expr): + if is_integer(expr): + for tp in [np.int32, np.int64]: + iinfo = np.iinfo(tp) + if iinfo.min <= expr <= iinfo.max: + return np.dtype(tp) + + else: + raise TypeInferenceFailure("integer constant '%s' too large" % expr) + + dt = np.asarray(expr).dtype + if hasattr(expr, "dtype"): + return expr.dtype + elif isinstance(expr, np.number): + # Numpy types are sized + return np.dtype(type(expr)) + elif dt.kind == "f": + # deduce the smaller type by default + return np.dtype(np.float32) + elif dt.kind == "c": + if np.complex64(expr) == np.complex128(expr): + # (COMPLEX_GUESS_LOGIC) + # No precision is lost by 'guessing' single precision, use that. + # This at least covers simple cases like '1j'. + return np.dtype(np.complex64) + + # Codegen for complex types depends on exactly correct types. + # Refuse temptation to guess. + raise TypeInferenceFailure("Complex constant '%s' needs to " + "be sized for type inference " % expr) + else: + raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) + + def map_subscript(self, expr): + return self.rec(expr.aggregate) + + def map_linear_subscript(self, expr): + return self.rec(expr.aggregate) + + def map_call(self, expr): + from pymbolic.primitives import Variable + + identifier = expr.function + if isinstance(identifier, Variable): + identifier = identifier.name + + arg_dtypes = tuple(self.rec(par) for par in expr.parameters) + + mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) + if mangle_result is not None: + return mangle_result[0] + + raise RuntimeError("no type inference information on " + "function '%s'" % identifier) + + def map_variable(self, expr): + if expr.name in self.kernel.all_inames(): + return self.kernel.index_dtype + + result = self.kernel.mangle_symbol(expr.name) + if result is not None: + result_dtype, _ = result + return result_dtype + + obj = self.new_assignments.get(expr.name) + + if obj is None: + obj = self.kernel.arg_dict.get(expr.name) + + if obj is None: + obj = self.kernel.temporary_variables.get(expr.name) + + if obj is None: + raise TypeInferenceFailure("name not known in type inference: %s" + % expr.name) + + from loopy.kernel.data import TemporaryVariable, KernelArgument + import loopy as lp + if isinstance(obj, TemporaryVariable): + result = obj.dtype + if result is lp.auto: + raise DependencyTypeInferenceFailure( + "temporary variable '%s'" % expr.name, + expr.name) + else: + return result + + elif isinstance(obj, KernelArgument): + result = obj.dtype + if result is None: + raise DependencyTypeInferenceFailure( + "argument '%s'" % expr.name, + expr.name) + else: + return result + + else: + raise RuntimeError("unexpected type inference " + "object type for '%s'" % expr.name) + + map_tagged_variable = map_variable + + def map_lookup(self, expr): + agg_result = self.rec(expr.aggregate) + dtype, offset = agg_result.fields[expr.name] + return dtype + + def map_comparison(self, expr): + # "bool" is unusable because OpenCL's bool has indeterminate memory + # format. + return np.dtype(np.int32) + + map_logical_not = map_comparison + map_logical_and = map_comparison + map_logical_or = map_comparison + + def map_reduction(self, expr): + return expr.operation.result_dtype( + self.kernel.target, self.rec(expr.expr), expr.inames) + +# }}} + +# vim: fdm=marker diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index f169eeef52ef7f029dedcf9412dfa6ef828343d6..1a610b63cb35f4931e41446f00c16a182de8430a 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -22,17 +22,49 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from loopy.diagnostic import LoopyError + def f2loopy(source, free_form=True, strict=True, - pre_transform_code=None): + pre_transform_code=None, pre_transform_code_context=None, + use_c_preprocessor=False, + file_name="<floopy code>"): + if use_c_preprocessor: + try: + import ply.lex as lex + import ply.cpp as cpp + except ImportError: + raise LoopyError("Using the C preprocessor requires PLY to be installed") + + lexer = lex.lex(cpp) + + from ply.cpp import Preprocessor + p = Preprocessor(lexer) + p.parse(source, file_name) + + tokens = [] + while True: + tok = p.token() + + if not tok: + break + + if tok.type == "CPP_COMMENT": + continue + + tokens.append(tok.value) + + source = "".join(tokens) + from fparser import api tree = api.parse(source, isfree=free_form, isstrict=strict, analyze=False, ignore_comments=False) from loopy.frontend.fortran.translator import F2LoopyTranslator - f2loopy = F2LoopyTranslator() + f2loopy = F2LoopyTranslator(file_name) f2loopy(tree) - return f2loopy.make_kernels(pre_transform_code=pre_transform_code) + return f2loopy.make_kernels(pre_transform_code=pre_transform_code, + pre_transform_code_context=pre_transform_code_context) # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index a6b5b422bcaa413033ecce4808be581139394876..abd925c14fe9ebf14342ac5ac25e2062291feefc 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -214,7 +214,7 @@ def remove_common_indentation(lines): # {{{ translator class F2LoopyTranslator(FTreeWalkerBase): - def __init__(self): + def __init__(self, filename): FTreeWalkerBase.__init__(self) self.scope_stack = [] @@ -234,6 +234,8 @@ class F2LoopyTranslator(FTreeWalkerBase): self.transform_code_lines = [] + self.filename = filename + def add_expression_instruction(self, lhs, rhs): scope = self.scope_stack[-1] @@ -331,7 +333,7 @@ class F2LoopyTranslator(FTreeWalkerBase): tp = self.dtype_from_stmt(node) - for name, shape in self.parse_dimension_specs(node.entity_decls): + for name, shape in self.parse_dimension_specs(node, node.entity_decls): if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -350,7 +352,7 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_Dimension(self, node): scope = self.scope_stack[-1] - for name, shape in self.parse_dimension_specs(node.items): + for name, shape in self.parse_dimension_specs(node, node.items): if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -369,7 +371,7 @@ class F2LoopyTranslator(FTreeWalkerBase): for name, data in node.stmts: name, = name assert name not in scope.data - scope.data[name] = [self.parse_expr(i) for i in data] + scope.data[name] = [self.parse_expr(node, i) for i in data] return [] @@ -399,7 +401,7 @@ class F2LoopyTranslator(FTreeWalkerBase): scope = self.scope_stack[-1] lhs = scope.process_expression_for_loopy( - self.parse_expr(node.variable)) + self.parse_expr(node, node.variable)) from pymbolic.primitives import Subscript, Call if isinstance(lhs, Call): raise TranslationError("function call (to '%s') on left hand side of" @@ -411,7 +413,7 @@ class F2LoopyTranslator(FTreeWalkerBase): scope.use_name(lhs_name) - rhs = scope.process_expression_for_loopy(self.parse_expr(node.expr)) + rhs = scope.process_expression_for_loopy(self.parse_expr(node, node.expr)) self.add_expression_instruction(lhs, rhs) @@ -425,9 +427,7 @@ class F2LoopyTranslator(FTreeWalkerBase): raise NotImplementedError("save") def map_Line(self, node): - #from warnings import warn - #warn("Encountered a 'line': %s" % node) - raise NotImplementedError + pass def map_Program(self, node): raise NotImplementedError @@ -467,7 +467,7 @@ class F2LoopyTranslator(FTreeWalkerBase): cond_var = var(cond_name) self.add_expression_instruction( - cond_var, self.parse_expr(node.expr)) + cond_var, self.parse_expr(node, node.expr)) self.conditions.append(cond_name) @@ -489,6 +489,7 @@ class F2LoopyTranslator(FTreeWalkerBase): loop_var = loop_var.strip() scope.use_name(loop_var) loop_bounds = self.parse_expr( + node, loop_bounds, min_precedence=self.expr_parser._PREC_FUNC_ARGS) if len(loop_bounds) == 2: @@ -627,12 +628,16 @@ class F2LoopyTranslator(FTreeWalkerBase): # }}} - def make_kernels(self, pre_transform_code=None): + def make_kernels(self, pre_transform_code=None, pre_transform_code_context=None): kernel_names = [ sub.subprogram_name for sub in self.kernels] - proc_dict = {} + if pre_transform_code_context is None: + proc_dict = {} + else: + proc_dict = pre_transform_code_context.copy() + proc_dict["lp"] = lp proc_dict["np"] = np diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index 4291d98749cb84bf743c3e8c745052190000cd03..b1df6e3d01317a315354cf10a55b9312090dc61f 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -24,6 +24,8 @@ THE SOFTWARE. import re +from loopy.diagnostic import LoopyError + class FTreeWalkerBase(object): def __init__(self): @@ -53,13 +55,13 @@ class FTreeWalkerBase(object): r"^(?P<name>[_0-9a-zA-Z]+)" "(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$") - def parse_dimension_specs(self, dim_decls): + def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): start_end = bounds_str.split(":") assert 1 <= len(start_end) <= 2 - return [self.parse_expr(s) for s in start_end] + return [self.parse_expr(node, s) for s in start_end] for decl in dim_decls: entity_match = self.ENTITY_RE.match(decl) @@ -81,8 +83,13 @@ class FTreeWalkerBase(object): # {{{ expressions - def parse_expr(self, expr_str, **kwargs): - return self.expr_parser(expr_str, **kwargs) + def parse_expr(self, node, expr_str, **kwargs): + try: + return self.expr_parser(expr_str, **kwargs) + except Exception as e: + raise LoopyError( + "Error parsing expression '%s' on line %d of '%s': %s" + % (expr_str, node.item.span[0], self.filename, str(e))) # }}} diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index fdf38b768269dfc8e86c76de9fd1dcf05e4e86e6..b876513f754209f7ff498f4b9243a9ac54cce7f8 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -137,7 +137,8 @@ def iname_rel_aff(space, iname, rel, aff): raise ValueError("unknown value of 'rel': %s" % rel) -def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context): +def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context, + prefer_constants): if context is not None: context = isl.align_spaces(context, pw_aff.get_domain_space(), obj_bigger_ok=True).params() @@ -163,12 +164,21 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context) # put constant bounds with unbounded validity first # FIXME: Heuristi-hack. - order = [ - (True, False), # constant, unbounded validity - (False, False), # nonconstant, unbounded validity - (True, True), # constant, bounded validity - (False, True), # nonconstant, bounded validity - ] + if prefer_constants: + order = [ + (True, False), # constant, unbounded validity + (False, False), # nonconstant, unbounded validity + (True, True), # constant, bounded validity + (False, True), # nonconstant, bounded validity + ] + else: + order = [ + (False, False), # nonconstant, unbounded validity + (True, False), # constant, unbounded validity + (False, True), # nonconstant, bounded validity + (True, True), # constant, bounded validity + ] + pieces = flatten([ [(set, aff) for set, aff in pieces if aff.is_cst() == want_is_constant @@ -199,19 +209,22 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context) % (what, pw_aff)) -def static_min_of_pw_aff(pw_aff, constants_only, context=None): +def static_min_of_pw_aff(pw_aff, constants_only, context=None, + prefer_constants=True): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.ge_set, - "minimum", context) + "minimum", context, prefer_constants) -def static_max_of_pw_aff(pw_aff, constants_only, context=None): +def static_max_of_pw_aff(pw_aff, constants_only, context=None, + prefer_constants=True): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.le_set, - "maximum", context) + "maximum", context, prefer_constants) -def static_value_of_pw_aff(pw_aff, constants_only, context=None): +def static_value_of_pw_aff(pw_aff, constants_only, context=None, + prefer_constants=True): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.eq_set, - "value", context) + "value", context, prefer_constants) def duplicate_axes(isl_obj, duplicate_inames, new_inames): @@ -363,4 +376,12 @@ def simplify_via_aff(expr): expr)) +def project_out(set, inames): + for iname in inames: + var_dict = set.get_var_dict() + dt, dim_idx = var_dict[iname] + set = set.project_out(dt, dim_idx, 1) + + return set + # vim: foldmethod=marker diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e99dedbf9b4ec36dfebe082da970ff260e2fb27e..7edf43e43bceb6ee3622f3645ad37c4375a703e0 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -172,7 +172,7 @@ def parse_insn(insn): elif subst_match is not None: groups = subst_match.groupdict() else: - raise RuntimeError("isntruction parse error at '%s'" % insn) + raise RuntimeError("instruction parse error at '%s'" % insn) from loopy.symbolic import parse try: diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index ab747b9a66f82044e0fe7dc4591867fdd34d3519..32c062689b47acd688933a1f615af3cd6f67135a 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -747,34 +747,6 @@ class ExpressionInstruction(InstructionBase): # }}} -def _remove_common_indentation(code): - if "\n" not in code: - return code - - # accommodate pyopencl-ish syntax highlighting - code = code.lstrip("//CL//") - - if not code.startswith("\n"): - return code - - lines = code.split("\n") - while lines[0].strip() == "": - lines.pop(0) - while lines[-1].strip() == "": - lines.pop(-1) - - if lines: - base_indent = 0 - while lines[0][base_indent] in " \t": - base_indent += 1 - - for line in lines[1:]: - if line[:base_indent].strip(): - raise ValueError("inconsistent indentation") - - return "\n".join(line[base_indent:] for line in lines) - - # {{{ c instruction class CInstruction(InstructionBase): @@ -873,7 +845,8 @@ class CInstruction(InstructionBase): # }}} self.iname_exprs = new_iname_exprs - self.code = _remove_common_indentation(code) + from loopy.tools import remove_common_indentation + self.code = remove_common_indentation(code) self.read_variables = read_variables self.assignees = new_assignees diff --git a/loopy/precompute.py b/loopy/precompute.py index 5675537f1f3915dcce8b68b2d36200bdcaef3f5a..935d6d44040cf56b3ca3bdbbec26e22ec750a05d 100644 --- a/loopy/precompute.py +++ b/loopy/precompute.py @@ -473,7 +473,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, if precompute_inames is not None and i < len(precompute_inames): name = precompute_inames[i] tag_lookup_saxis = name - if not precompute_inames_already_exist and var_name_gen.is_name_conflicting(name): + if (not precompute_inames_already_exist + and var_name_gen.is_name_conflicting(name)): raise RuntimeError("new storage axis name '%s' " "conflicts with existing name" % name) @@ -543,8 +544,34 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, domch.domain, non1_storage_axis_names, boxify_sweep=fetch_bounding_box)) else: - new_kernel_domains = domch.get_domains_with(domch.domain) + check_domain = domch.domain + + # {{{ check the domain the preexisting inames' domain + + # inames already exist in check_domain, add them primed + primed_non1_saxis_names = [ + iname+"'" for iname in non1_storage_axis_names] + + check_domain = abm.augment_domain_with_sweep( + check_domain, primed_non1_saxis_names, + boxify_sweep=fetch_bounding_box) + + # project out the original copies + from loopy.isl_helpers import project_out + check_domain = project_out(check_domain, non1_storage_axis_names) + for iname in non1_storage_axis_names: + var_dict = check_domain.get_var_dict() + dt, dim_idx = var_dict[iname+"'"] + check_domain = check_domain.set_dim_name(dt, dim_idx, iname) + + if not (check_domain <= domch.domain and domch.domain <= check_domain): + raise LoopyError("domain of preexisting inames does not match " + "domain needed for precompute") + + # }}} + + new_kernel_domains = domch.get_domains_with(domch.domain) else: # leave kernel domains unchanged diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f9cd3f3eb2140354decf61b475ba1742d5ea6832..9de31233b831215bba72ba1e226b0df555b42286 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -162,7 +162,7 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} - from loopy.codegen.expression import TypeInferenceMapper + from loopy.expression import TypeInferenceMapper type_inf_mapper = TypeInferenceMapper(kernel, _DictUnionView([ new_temp_vars, @@ -401,7 +401,7 @@ def realize_reduction(kernel, insn_id_filter=None): var_name_gen = kernel.get_var_name_generator() new_temporary_variables = kernel.temporary_variables.copy() - from loopy.codegen.expression import TypeInferenceMapper + from loopy.expression import TypeInferenceMapper type_inf_mapper = TypeInferenceMapper(kernel) def map_reduction(expr, rec): diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8897f35938ee29e50d70edfdc52945bc2ad856ac..bad7f840f648bc72d7f505fb72a200f3df2fb0d6 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -305,7 +305,27 @@ class Reduction(AlgebraicLeaf): init_arg_names = ("operation", "inames", "expr") def __init__(self, operation, inames, expr): + if isinstance(inames, str): + inames = tuple(iname.strip() for iname in inames.split(",")) + + elif isinstance(inames, Variable): + inames = (inames,) + assert isinstance(inames, tuple) + + def strip_var(iname): + if isinstance(iname, Variable): + iname = iname.name + + assert isinstance(iname, str) + return iname + + inames = tuple(strip_var(iname) for iname in inames) + + if isinstance(operation, str): + from loopy.library.reduction import parse_reduction_op + operation = parse_reduction_op(operation) + from loopy.library.reduction import ReductionOperation assert isinstance(operation, ReductionOperation) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..89b0d578b4871bc6d51177b6f9d6f6edd893db1a --- /dev/null +++ b/loopy/target/c/__init__.py @@ -0,0 +1,114 @@ +"""OpenCL target independent of PyOpenCL.""" + +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six + +import numpy as np # noqa +from loopy.target import TargetBase + +from pytools import memoize_method + + +class CTarget(TargetBase): + @memoize_method + def get_dtype_registry(self): + from loopy.target.c.compyte.dtypes import ( + DTypeRegistry, fill_with_registry_with_c_types) + result = DTypeRegistry() + fill_with_registry_with_c_types(result, respect_windows=False, + include_bool=True) + return result + + def is_vector_dtype(self, dtype): + return False + + def get_vector_dtype(self, base, count): + raise KeyError() + + def get_or_register_dtype(self, names, dtype=None): + return self.get_dtype_registry().get_or_register_dtype(names, dtype) + + def dtype_to_typename(self, dtype): + return self.get_dtype_registry().dtype_to_ctype(dtype) + + def get_expression_to_code_mapper(self, kernel, + seen_dtypes, seen_functions, allow_complex): + from loopy.target.c.codegen.expression import LoopyCCodeMapper + return (LoopyCCodeMapper(kernel, seen_dtypes, seen_functions, + allow_complex=allow_complex)) + + # {{{ code generation + + def generate_code(self, kernel, codegen_state, impl_arg_info): + from cgen import FunctionBody, FunctionDeclaration, Value, Module + + body, implemented_domains = kernel.target.generate_body( + kernel, codegen_state) + + mod = Module([ + FunctionBody( + kernel.target.wrap_function_declaration( + kernel, + FunctionDeclaration( + Value("void", kernel.name), + [iai.cgen_declarator for iai in impl_arg_info])), + body) + ]) + + return str(mod), implemented_domains + + def wrap_function_declaration(self, kernel, fdecl): + return fdecl + + def generate_body(self, kernel, codegen_state): + from cgen import Block + body = Block() + + # {{{ declare temporaries + + body.extend( + idi.cgen_declarator + for tv in six.itervalues(kernel.temporary_variables) + for idi in tv.decl_info( + kernel.target, + is_written=True, index_dtype=kernel.index_dtype)) + + # }}} + + from loopy.codegen.loop import set_up_hw_parallel_loops + gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state) + + from cgen import Line + body.append(Line()) + + if isinstance(gen_code.ast, Block): + body.extend(gen_code.ast.contents) + else: + body.append(gen_code.ast) + + return body, gen_code.implemented_domains + + # }}} diff --git a/loopy/target/c/codegen/__init__.py b/loopy/target/c/codegen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/loopy/codegen/expression.py b/loopy/target/c/codegen/expression.py similarity index 77% rename from loopy/codegen/expression.py rename to loopy/target/c/codegen/expression.py index f834c8fa50287c14aec1417ebef093dffd5062d9..1dc3aafcfb63789255a7f5fd79b178e657b3c74d 100644 --- a/loopy/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -31,238 +31,11 @@ import numpy as np from pymbolic.mapper import RecursiveMapper from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT, PREC_POWER) -from pymbolic.mapper import CombineMapper import islpy as isl -from pytools import Record -from loopy.tools import is_integer -from loopy.diagnostic import TypeInferenceFailure, DependencyTypeInferenceFailure - - -# {{{ type inference - -class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): - """ - :arg new_assignments: mapping from names to either - :class:`loopy.kernel.data.TemporaryVariable` - or - :class:`loopy.kernel.data.KernelArgument` - instances - """ - self.kernel = kernel - if new_assignments is None: - new_assignments = {} - self.new_assignments = new_assignments - - # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) - # are Python-equal (for many common constants such as integers). - - @staticmethod - def combine(dtypes): - dtypes = list(dtypes) - - result = dtypes.pop() - while dtypes: - other = dtypes.pop() - - if result.isbuiltin and other.isbuiltin: - if (result, other) in [ - (np.int32, np.float32), (np.int32, np.float32)]: - # numpy makes this a double. I disagree. - result = np.dtype(np.float32) - else: - result = ( - np.empty(0, dtype=result) - + np.empty(0, dtype=other) - ).dtype - elif result.isbuiltin and not other.isbuiltin: - # assume the non-native type takes over - result = other - elif not result.isbuiltin and other.isbuiltin: - # assume the non-native type takes over - pass - else: - if result is not other: - raise TypeInferenceFailure( - "nothing known about result of operation on " - "'%s' and '%s'" % (result, other)) - - return result - - def map_sum(self, expr): - dtypes = [] - small_integer_dtypes = [] - for child in expr.children: - dtype = self.rec(child) - if is_integer(child) and abs(child) < 1024: - small_integer_dtypes.append(dtype) - else: - dtypes.append(dtype) - - from pytools import all - if all(dtype.kind == "i" for dtype in dtypes): - dtypes.extend(small_integer_dtypes) - - return self.combine(dtypes) - - map_product = map_sum - - def map_quotient(self, expr): - n_dtype = self.rec(expr.numerator) - d_dtype = self.rec(expr.denominator) - - if n_dtype.kind in "iu" and d_dtype.kind in "iu": - # both integers - return np.dtype(np.float64) - - else: - return self.combine([n_dtype, d_dtype]) - - def map_constant(self, expr): - if is_integer(expr): - for tp in [np.int32, np.int64]: - iinfo = np.iinfo(tp) - if iinfo.min <= expr <= iinfo.max: - return np.dtype(tp) - - else: - raise TypeInferenceFailure("integer constant '%s' too large" % expr) - - dt = np.asarray(expr).dtype - if hasattr(expr, "dtype"): - return expr.dtype - elif isinstance(expr, np.number): - # Numpy types are sized - return np.dtype(type(expr)) - elif dt.kind == "f": - # deduce the smaller type by default - return np.dtype(np.float32) - elif dt.kind == "c": - if np.complex64(expr) == np.complex128(expr): - # (COMPLEX_GUESS_LOGIC) - # No precision is lost by 'guessing' single precision, use that. - # This at least covers simple cases like '1j'. - return np.dtype(np.complex64) - - # Codegen for complex types depends on exactly correct types. - # Refuse temptation to guess. - raise TypeInferenceFailure("Complex constant '%s' needs to " - "be sized for type inference " % expr) - else: - raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) - - def map_subscript(self, expr): - return self.rec(expr.aggregate) - - def map_linear_subscript(self, expr): - return self.rec(expr.aggregate) - - def map_call(self, expr): - from pymbolic.primitives import Variable - - identifier = expr.function - if isinstance(identifier, Variable): - identifier = identifier.name - - arg_dtypes = tuple(self.rec(par) for par in expr.parameters) - - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if mangle_result is not None: - return mangle_result[0] +from loopy.expression import dtype_to_type_context, TypeInferenceMapper - raise RuntimeError("no type inference information on " - "function '%s'" % identifier) - - def map_variable(self, expr): - if expr.name in self.kernel.all_inames(): - return self.kernel.index_dtype - - result = self.kernel.mangle_symbol(expr.name) - if result is not None: - result_dtype, _ = result - return result_dtype - - obj = self.new_assignments.get(expr.name) - - if obj is None: - obj = self.kernel.arg_dict.get(expr.name) - - if obj is None: - obj = self.kernel.temporary_variables.get(expr.name) - - if obj is None: - raise TypeInferenceFailure("name not known in type inference: %s" - % expr.name) - - from loopy.kernel.data import TemporaryVariable, KernelArgument - import loopy as lp - if isinstance(obj, TemporaryVariable): - result = obj.dtype - if result is lp.auto: - raise DependencyTypeInferenceFailure( - "temporary variable '%s'" % expr.name, - expr.name) - else: - return result - - elif isinstance(obj, KernelArgument): - result = obj.dtype - if result is None: - raise DependencyTypeInferenceFailure( - "argument '%s'" % expr.name, - expr.name) - else: - return result - - else: - raise RuntimeError("unexpected type inference " - "object type for '%s'" % expr.name) - - map_tagged_variable = map_variable - - def map_lookup(self, expr): - agg_result = self.rec(expr.aggregate) - dtype, offset = agg_result.fields[expr.name] - return dtype - - def map_comparison(self, expr): - # "bool" is unusable because OpenCL's bool has indeterminate memory - # format. - return np.dtype(np.int32) - - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison - - def map_reduction(self, expr): - return expr.operation.result_dtype( - self.kernel.target, self.rec(expr.expr), expr.inames) - -# }}} - - -# {{{ C code mapper - -# type_context may be: -# - 'i' for integer - -# - 'f' for single-precision floating point -# - 'd' for double-precision floating point -# or None for 'no known context'. - -def dtype_to_type_context(target, dtype): - dtype = np.dtype(dtype) - - if dtype.kind == 'i': - return 'i' - if dtype in [np.float64, np.complex128]: - return 'd' - if dtype in [np.float32, np.complex64]: - return 'f' - if target.is_vector_dtype(dtype): - return dtype_to_type_context(target, dtype.fields["x"][0]) - - return None +from loopy.tools import is_integer def get_opencl_vec_member(idx): @@ -273,25 +46,7 @@ def get_opencl_vec_member(idx): return "s%s" % hex(int(idx))[2:] -class SeenFunction(Record): - """ - .. attribute:: name - .. attribute:: c_name - .. attribute:: arg_dtypes - - a tuple of arg dtypes - """ - - def __init__(self, name, c_name, arg_dtypes): - Record.__init__(self, - name=name, - c_name=c_name, - arg_dtypes=arg_dtypes) - - def __hash__(self): - return hash((type(self),) - + tuple((f, getattr(self, f)) for f in type(self).fields)) - +# {{{ C code mapper class LoopyCCodeMapper(RecursiveMapper): def __init__(self, kernel, seen_dtypes, seen_functions, var_subst_map={}, @@ -554,6 +309,7 @@ class LoopyCCodeMapper(RecursiveMapper): def seen_func(name): idt = self.kernel.index_dtype + from loopy.codegen import SeenFunction self.seen_functions.add(SeenFunction(name, name, (idt, idt))) if den_nonneg: @@ -675,6 +431,7 @@ class LoopyCCodeMapper(RecursiveMapper): "for function '%s' not understood" % identifier) + from loopy.codegen import SeenFunction self.seen_functions.add(SeenFunction(identifier, c_name, par_dtypes)) if str_parameters is None: # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte new file mode 160000 index 0000000000000000000000000000000000000000..fb6ba114d9d906403d47b0aaf69e2fe4cef382f2 --- /dev/null +++ b/loopy/target/c/compyte @@ -0,0 +1 @@ +Subproject commit fb6ba114d9d906403d47b0aaf69e2fe4cef382f2 diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py index 04efdedabb6e281b99c8ffae3a5be53e084524ed..e68040c0983d5b6f144ae2b42eaa41899ef56b76 100644 --- a/loopy/target/opencl/__init__.py +++ b/loopy/target/opencl/__init__.py @@ -26,44 +26,21 @@ THE SOFTWARE. import numpy as np -from loopy.target import TargetBase - - -# {{{ type registry - -def _register_types(): - from loopy.target.opencl.compyte.dtypes import ( - _fill_dtype_registry, get_or_register_dtype) - import struct - - _fill_dtype_registry(respect_windows=False, include_bool=False) - - # complex number support left out - - is_64_bit = struct.calcsize('@P') * 8 == 64 - if not is_64_bit: - get_or_register_dtype( - ["unsigned long", "unsigned long int"], np.uint64) - get_or_register_dtype( - ["signed long", "signed long int", "long int"], np.int64) - -_register_types() - -# }}} +from loopy.target.c import CTarget +from pytools import memoize_method # {{{ vector types -class vec: +class vec: # noqa pass def _create_vector_types(): field_names = ["x", "y", "z", "w"] - from loopy.target.opencl.compyte.dtypes import get_or_register_dtype - vec.types = {} + vec.names_and_dtypes = [] vec.type_to_scalar_and_count = {} counts = [2, 3, 4, 8, 16] @@ -109,40 +86,20 @@ def _create_vector_types(): dtype = np.dtype([(n, base_type) for (n, title) in zip(names, titles)]) - get_or_register_dtype(name, dtype) - setattr(vec, name, dtype) - def create_array(dtype, count, padded_count, *args, **kwargs): - if len(args) < count: - from warnings import warn - warn("default values for make_xxx are deprecated;" - " instead specify all parameters or use" - " array.vec.zeros_xxx", DeprecationWarning) - padded_args = tuple(list(args)+[0]*(padded_count-len(args))) - array = eval("array(padded_args, dtype=dtype)", - dict(array=np.array, padded_args=padded_args, - dtype=dtype)) - for key, val in kwargs.items(): - array[key] = val - return array - - setattr(vec, "make_"+name, staticmethod(eval( - "lambda *args, **kwargs: create_array(dtype, %i, %i, " - "*args, **kwargs)" % (count, padded_count), - dict(create_array=create_array, dtype=dtype)))) - setattr(vec, "filled_"+name, staticmethod(eval( - "lambda val: vec.make_%s(*[val]*%i)" % (name, count)))) - setattr(vec, "zeros_"+name, - staticmethod(eval("lambda: vec.filled_%s(0)" % (name)))) - setattr(vec, "ones_"+name, - staticmethod(eval("lambda: vec.filled_%s(1)" % (name)))) + vec.names_and_dtypes.append((name, dtype)) vec.types[np.dtype(base_type), count] = dtype vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count _create_vector_types() + +def _register_vector_types(dtype_registry): + for name, dtype in vec.names_and_dtypes: + dtype_registry.get_or_register_dtype(name, dtype) + # }}} @@ -234,7 +191,7 @@ def opencl_preamble_generator(target, seen_dtypes, seen_functions): # {{{ target -class OpenCLTarget(TargetBase): +class OpenCLTarget(CTarget): def function_manglers(self): return ( super(OpenCLTarget, self).function_manglers() + [ @@ -255,13 +212,24 @@ class OpenCLTarget(TargetBase): reduction_preamble_generator ]) - def get_or_register_dtype(self, names, dtype=None): - from loopy.target.opencl.compyte.dtypes import get_or_register_dtype - return get_or_register_dtype(names, dtype) + @memoize_method + def get_dtype_registry(self): + from loopy.target.c.compyte import ( + DTypeRegistry, fill_with_registry_with_c_types) + result = DTypeRegistry() + fill_with_registry_with_c_types(result) + + # complex number support left out - def dtype_to_typename(self, dtype): - from loopy.target.opencl.compyte.dtypes import dtype_to_ctype - return dtype_to_ctype(dtype) + # CL defines 'long' as 64-bit + result.get_or_register_dtype( + ["unsigned long", "unsigned long int"], np.uint64) + result.get_or_register_dtype( + ["signed long", "signed long int", "long int"], np.int64) + + _register_vector_types(result) + + return result def is_vector_dtype(self, dtype): return list(vec.types.values()) @@ -269,6 +237,43 @@ class OpenCLTarget(TargetBase): def get_vector_dtype(self, base, count): return vec.types[base, count] + def wrap_function_declaration(self, kernel, fdecl): + from cgen.opencl import CLKernel, CLRequiredWorkGroupSize + return CLRequiredWorkGroupSize( + kernel.get_grid_sizes_as_exprs()[1], + CLKernel(fdecl)) + + def generate_code(self, kernel, codegen_state, impl_arg_info): + code, implemented_domains = ( + super(OpenCLTarget, self).generate_code( + kernel, codegen_state, impl_arg_info)) + + from loopy.tools import remove_common_indentation + code = ( + remove_common_indentation(""" + #define lid(N) ((%(idx_ctype)s) get_local_id(N)) + #define gid(N) ((%(idx_ctype)s) get_group_id(N)) + """ % dict(idx_ctype=self.dtype_to_typename(kernel.index_dtype))) + + "\n\n" + + code) + + return code, implemented_domains + + def generate_body(self, kernel, codegen_state): + body, implemented_domains = ( + super(OpenCLTarget, self).generate_body(kernel, codegen_state)) + + from loopy.kernel.data import ImageArg + + if any(isinstance(arg, ImageArg) for arg in kernel.args): + from cgen import Value, Const, Initializer + body.contents.insert(0, + Initializer(Const(Value("sampler_t", "loopy_sampler")), + "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " + "| CLK_FILTER_NEAREST")) + + return body, implemented_domains + # }}} # vim: foldmethod=marker diff --git a/loopy/target/opencl/compyte b/loopy/target/opencl/compyte deleted file mode 160000 index 5d54e1b2b7f28d3e779029ac0b4aa5f957829f23..0000000000000000000000000000000000000000 --- a/loopy/target/opencl/compyte +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5d54e1b2b7f28d3e779029ac0b4aa5f957829f23 diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py index a1b323d7cf0deaef308952d87b1523b0a020ad68..a0a119dce42e1095ff327f38f6b836f80091b159 100644 --- a/loopy/target/pyopencl/__init__.py +++ b/loopy/target/pyopencl/__init__.py @@ -259,13 +259,9 @@ class PyOpenCLTarget(OpenCLTarget): def pre_codegen_check(self, kernel): check_sizes(kernel, self.device) - def get_or_register_dtype(self, names, dtype=None): - from pyopencl.compyte.dtypes import get_or_register_dtype - return get_or_register_dtype(names, dtype) - - def dtype_to_typename(self, dtype): - from pyopencl.compyte.dtypes import dtype_to_ctype - return dtype_to_ctype(dtype) + def get_dtype_registry(self): + from pyopencl.compyte.dtypes import TYPE_REGISTRY + return TYPE_REGISTRY def is_vector_dtype(self, dtype): from pyopencl.array import vec diff --git a/loopy/tools.py b/loopy/tools.py index 0c8898bb1ce20a732dbd0d64a679e18c98609edb..1f47160677c5255539a3a6c098c9ba6d8b89a18b 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -98,6 +98,8 @@ class LoopyKeyBuilder(KeyBuilderBase): # }}} +# {{{ picklable dtype + class PicklableDtype(object): """This object works around several issues with pickling :class:`numpy.dtype` objects. It does so by serving as a picklable wrapper around the original @@ -157,4 +159,38 @@ class PicklableDtype(object): def assert_has_target(self): assert self.target is not None +# }}} + + +# {{{ remove common indentation + +def remove_common_indentation(code): + if "\n" not in code: + return code + + # accommodate pyopencl-ish syntax highlighting + code = code.lstrip("//CL//") + + if not code.startswith("\n"): + return code + + lines = code.split("\n") + while lines[0].strip() == "": + lines.pop(0) + while lines[-1].strip() == "": + lines.pop(-1) + + if lines: + base_indent = 0 + while lines[0][base_indent] in " \t": + base_indent += 1 + + for line in lines[1:]: + if line[:base_indent].strip(): + raise ValueError("inconsistent indentation") + + return "\n".join(line[base_indent:] for line in lines) + +# }}} + # vim: foldmethod=marker diff --git a/requirements.txt b/requirements.txt index 708b72c8d508c979f069d41b645da18b559ecfd5..b833d6dd0d989ad3b88aedb3d09cf21766536613 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,6 @@ git+git://github.com/pyopencl/pyopencl git+git://github.com/inducer/pymbolic hg+https://bitbucket.org/inducer/f2py + +# Optional, needed for using the C preprocessor on Fortran +ply>=3.6 diff --git a/setup.cfg b/setup.cfg index 6faef2e65abe138f9ab22c94452c43be0e52c075..d3f13a0e64b79c00a957cb1369e335e0b8a00d76 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,3 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802 max-line-length=85 diff --git a/test/test_loopy.py b/test/test_loopy.py index f99c0b3b3638d024530c339e4340e27d71043c4e..2173347cadaea66a4ec53d78fb8c77abb24cc7bb 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1958,6 +1958,112 @@ def test_auto_test_can_detect_problems(ctx_factory): parameters=dict(n=123)) +def test_generate_c_snippet(): + from loopy.target.c import CTarget + + from pymbolic import var + I = var("I") # noqa + f = var("f") + df = var("df") + q_v = var("q_v") + eN = var("eN") # noqa + k = var("k") + u = var("u") + + from functools import partial + l_sum = partial(lp.Reduction, "sum") + + Instr = lp.ExpressionInstruction # noqa + + knl = lp.make_kernel( + "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}", + [ + Instr(f[I], l_sum(k, q_v[k, I]*u)), + Instr(df[I], l_sum(k, q_v[k, I])), + ], + [ + lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"), + lp.GlobalArg("f,df", np.float64, shape="nSpace"), + lp.ValueArg("u", np.float64), + "...", + ], + target=CTarget(), + assumptions="nQuad>=1") + + if 0: # enable to play with prefetching + # (prefetch currently requires constant sizes) + knl = lp.fix_parameters(knl, nQuad=5, nSpace=3) + knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None) + + knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1)) + knl = lp.set_loop_priority(knl, "I,k_outer,k_inner") + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + print(lp.generate_body(knl)) + + +def test_precompute_with_preexisting_inames(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}", + """ + result[e,i] = sum(j, D1[i,j]*u[e,j]) + result2[e,i] = sum(k, D2[i,k]*u[e,k]) + """) + + knl = lp.add_and_infer_dtypes(knl, { + "u": np.float32, + "D1": np.float32, + "D2": np.float32, + }) + + knl = lp.fix_parameters(knl, n=13) + + ref_knl = knl + + knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") + knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") + + knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", + precompute_inames="ii,jj") + knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for", + precompute_inames="ii,jj") + + knl = lp.set_loop_priority(knl, "ii,jj,e,j,k") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(E=200)) + + +def test_precompute_with_preexisting_inames_fail(): + knl = lp.make_kernel( + "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}", + """ + result[e,i] = sum(j, D1[i,j]*u[e,j]) + result2[e,i] = sum(k, D2[i,k]*u[e,k]) + """) + + knl = lp.add_and_infer_dtypes(knl, { + "u": np.float32, + "D1": np.float32, + "D2": np.float32, + }) + + knl = lp.fix_parameters(knl, n=13) + + knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") + knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") + + knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", + precompute_inames="ii,jj") + with pytest.raises(lp.LoopyError): + lp.precompute(knl, "D2_subst", "i,k", default_tag="for", + precompute_inames="ii,jj") + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])