diff --git a/loopy/__init__.py b/loopy/__init__.py index 92b7fca77a85d5bf4531100040ab9b8e772d8a8c..642cad183f588b09ede2af97370ca6855a1da90d 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -139,7 +139,8 @@ from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran, parse_fortran) from loopy.target import TargetBase, ASTBuilderBase -from loopy.target.c import CTarget, ExecutableCTarget, generate_header +from loopy.target.c import (CTarget, ExecutableCTarget, generate_header, + OpenMPTarget) from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget @@ -262,7 +263,7 @@ __all__ = [ "TargetBase", "CTarget", "ExecutableCTarget", "generate_header", - "CudaTarget", "OpenCLTarget", + "CudaTarget", "OpenCLTarget", "OpenMPTarget", "PyOpenCLTarget", "ISPCTarget", "NumbaTarget", "NumbaCudaTarget", "ASTBuilderBase", diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index c946e09a086e574a2593d60f652a81773d95a1fe..b7e7d70852cce53a1b1425372ef3159ccf425b9a 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -94,9 +94,13 @@ def get_usable_inames_for_conditional(kernel, sched_index): # # - ILP indices are not available in loop bounds, they only get defined # at the innermost level of nesting. + # + # - Target (e.g. OpenMP) would generate explicit loops for hardware + # concurrent inames. if ( kernel.iname_tags_of_type(iname, ConcurrentTag) + and not kernel.target.needs_explicit_hw_concurrent_loops() and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) and crosses_barrier) and not kernel.iname_tags_of_type(iname, IlpBaseTag) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 966fd1e07df7adb446f96ff5d6b74d4021f48b35..d49b9a4071e5da640ab7e650978f3b731acfcb01 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -136,8 +136,8 @@ def generate_code_for_sched_index(codegen_state, sched_index): generate_vectorize_loop, generate_sequential_loop_dim_code) - from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, - ForceSequentialTag, LoopedIlpTag, VectorizeTag, + from loopy.kernel.data import (UnrolledIlpTag, HardwareConcurrentTag, + ForceSequentialTag, LoopedIlpTag, VectorizeTag, UnrollTag, InOrderSequentialSequentialTag, filter_iname_tags_by_type) if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)): func = generate_unroll_loop @@ -146,6 +146,10 @@ def generate_code_for_sched_index(codegen_state, sched_index): elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)): func = generate_sequential_loop_dim_code + elif filter_iname_tags_by_type(tags, HardwareConcurrentTag, max_num=1): + from loopy.target.c import OpenMPTarget + assert isinstance(kernel.target, OpenMPTarget) + func = generate_sequential_loop_dim_code else: raise RuntimeError("encountered (invalid) EnterLoop " "for '%s', tagged '%s'" diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 9b746bb99866ee933f4bb4e9597e6e719fbf8a8e..87e6f9a8f72dd81950ae19856262ccfd32b66168 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -217,6 +217,8 @@ def parse_tag(tag): if tag == "for": return None + if tag == "forceseq": + return ForceSequentialTag() elif tag == "ord": return InOrderSequentialSequentialTag() elif tag in ["unr"]: diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 440ac22cb890bd9f1b47f909ee96681c39c33975..4f1bb44efbbc39a1c7ae9298b8ae74dafd495421 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1875,7 +1875,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for insn_id in sched_item_to_insn_id(item)) from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, - filter_iname_tags_by_type) + filter_iname_tags_by_type, HardwareConcurrentTag) ilp_inames = set( iname for iname, tags in six.iteritems(kernel.iname_to_tags) @@ -1889,6 +1889,14 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for iname, tags in six.iteritems(kernel.iname_to_tags) if filter_iname_tags_by_type(tags, ConcurrentTag)) + if kernel.target.needs_explicit_hw_concurrent_loops(): + # need to generate explicit loops for targets such as OpenMP + parallel_inames -= set( + iname for iname, tags in six.iteritems(kernel.iname_to_tags) + if filter_iname_tags_by_type(tags, HardwareConcurrentTag) + # TODO: should this be AxisTag instead? + ) + loop_nest_with_map = find_loop_nest_with_map(kernel) loop_nest_around_map = find_loop_nest_around_map(kernel) sched_state = SchedulerState( diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a08b406f53798b4f7f6852a4f424182a75b224e4..5f981c0f361da7f93c5abb6770e97d8e1e38bffd 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -117,6 +117,9 @@ class TargetBase(object): def is_vector_dtype(self, dtype): raise NotImplementedError() + def needs_explicit_hw_concurrent_loops(self): + raise NotImplementedError() + def vector_dtype(self, base, count): raise NotImplementedError() diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 8e69793e8079864a7e4c3117f267a20d6db3962f..9f4d44c54a09d7e8ea9170a9941f903d47f88cde 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -288,6 +288,9 @@ class CTarget(TargetBase): def is_vector_dtype(self, dtype): return False + def needs_explicit_hw_concurrent_loops(self): + return False + def get_vector_dtype(self, base, count): raise KeyError() @@ -331,6 +334,33 @@ class ExecutableCTarget(CTarget): # }}} +# {{{ OpenMP target + +class OpenMPTarget(CTarget): + """A target for the OpenMP programming model. + """ + + def __init__(self): + super(OpenMPTarget, self).__init__() + + def needs_explicit_hw_concurrent_loops(self): + return True + + def pre_codegen_check(self, kernel): + from loopy.kernel.data import AxisTag + for i, tags in kernel.iname_to_tags.items(): + for tag in tags: + if isinstance(tag, AxisTag): + if tag.axis != 0: + raise LoopyError( + "Only axis=0 allowed for OpenMP hardware tags.") + + def get_device_ast_builder(self): + return OpenMPCASTBuilder(self) + +# }}} + + class _ConstRestrictPointer(Pointer): def get_decl_pair(self): sub_tp, sub_decl = self.subdecl.get_decl_pair() @@ -969,6 +999,55 @@ class CASTBuilder(ASTBuilderBase): return node +# {{{ OpenMP target AST builder + +class OpenMPCASTBuilder(CASTBuilder): + + # {{{ code generation guts + + def get_expression_to_c_expression_mapper(self, codegen_state): + from loopy.target.c.codegen.expression import ExprToOpenMPCExprMapper + return ExprToOpenMPCExprMapper(codegen_state) + + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, + lbound, ubound, inner): + """mark inames tagged as "l.0" with #pragma omp simd, and "g.0" with + #pragma parallel for. + """ + + from loopy.kernel.data import (filter_iname_tags_by_type, LocalIndexTag, + GroupIndexTag) + + loop = super(OpenMPCASTBuilder, self).emit_sequential_loop( + codegen_state, iname, iname_dtype, lbound, ubound, inner) + + tags = filter_iname_tags_by_type( + codegen_state.kernel.iname_to_tags[iname], + (LocalIndexTag, GroupIndexTag), 1) + if tags: + tag, = tags + from cgen import Block, Pragma + if isinstance(tag, LocalIndexTag) and tag.axis == 0: + loop = Block([Pragma("omp simd"), loop]) + elif isinstance(tag, GroupIndexTag) and tag.axis == 0: + loop = Block([Pragma("omp parallel for"), loop]) + + return loop + + def emit_barrier(self, synchronization_kind, mem_kind, comment): + if synchronization_kind == "local": + return + elif synchronization_kind == "global": + from cgen import Pragma + return Pragma("omp barrier") + else: + raise LoopyError("unknown barrier kind") + + # }}} + +# }}} + + # {{{ header generation class CFunctionDeclExtractor(CASTIdentityMapper): diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 59ed77f9c17fa04d67e251c22bec88fc8b15936c..15fe526ef94a25b1a6e551c13f641815b7f1dfa3 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -791,7 +791,6 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_comparison(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_COMPARISON - return self.parenthesize_if_needed( "%s %s %s" % ( self.rec(expr.left, PREC_COMPARISON), @@ -897,4 +896,22 @@ class CExpressionToCodeMapper(RecursiveMapper): # }}} + +# {{{ expression to OpenMP expression mapper + +class ExprToOpenMPCExprMapper(ExpressionToCExpressionMapper): + + def map_variable(self, expr, type_context): + if expr.name in self.codegen_state.var_subst_map: + if not self.kernel.options.annotate_inames: + hardware_index = self.codegen_state.var_subst_map[expr.name] + from loopy.symbolic import HardwareAxisIndex + if isinstance(hardware_index, HardwareAxisIndex): + return var(expr.name) + + return super(ExprToOpenMPCExprMapper, self).map_variable(expr, type_context) + +# }}} + + # vim: fdm=marker diff --git a/loopy/target/numba.py b/loopy/target/numba.py index 6946063ee04f52a4890344b4cbff9446bacb6923..db86a37df563e8850c7e0868dd6a8b1f3177f0c6 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -121,6 +121,9 @@ class NumbaTarget(TargetBase): def get_vector_dtype(self, base, count): raise KeyError() + def needs_explicit_hw_concurrent_loops(self): + return False + def get_or_register_dtype(self, names, dtype=None): # These kind of shouldn't be here. return self.get_dtype_registry().get_or_register_dtype(names, dtype) @@ -187,6 +190,9 @@ class NumbaCudaTarget(TargetBase): def get_device_ast_builder(self): return NumbaCudaASTBuilder(self) + def needs_explicit_hw_concurrent_loops(self): + return False + # {{{ types @memoize_method diff --git a/test/test_target.py b/test/test_target.py index 7c0d003ee9e3730c6c754963c2e3d5e033298c51..df6faef363e6930f9ec5482528a6ce6c2119b725 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -103,6 +103,27 @@ def test_cuda_target(): lp.preprocess_kernel(knl)))[0]) +def test_openmp_target(): + from loopy.target.c import OpenMPTarget + + knl = lp.make_kernel( + "{ [i]: 0 <= i < 1024 }", + "out[i] = 2*a[i]", + [ + lp.GlobalArg("out,a", np.float32, shape=lp.auto), + "..." + ], + target=OpenMPTarget()) + + knl = lp.split_iname(knl, "i", 4, inner_tag="ilp.seq") + knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) + + print( + lp.generate_code( + lp.get_one_scheduled_kernel( + lp.preprocess_kernel(knl)))[0]) + + def test_generate_c_snippet(): from pymbolic import var I = var("I") # noqa