diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index d085c1215781f52a8f0c79359e8ac7061d94ffb7..386fbc18a2c5089486aba51395d671b4c9a600ff 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -112,7 +112,10 @@ Finishing up .. autofunction:: get_one_scheduled_kernel -.. autofunction:: generate_code +.. autoclass:: GeneratedProgram +.. autoclass:: CodeGenerationResult + +.. autofunction:: generate_code_v2 Setting options --------------- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index cde36163d09a5a7d31a1afc97e0bfcaa636cdda3..f6e7ad9c2211d24582e5027777b584fc5ac64d98 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -721,7 +721,7 @@ those for us: .. doctest:: - >>> glob, loc = knl.get_grid_sizes() + >>> glob, loc = knl.get_grid_size_upper_bounds() >>> print(glob) (Aff("[n] -> { [(floor((127 + n)/128))] }"),) >>> print(loc) diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index ecb45edbf0af51bdcbb1efcbbee8e18e0f3b500f..b71c561195647b7e35bc5538cfd1d601e270f3c2 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -5,14 +5,16 @@ import pyopencl.array knl = lp.make_kernel( "{ [i,k]: 0<=i<n and 0<=k<3 }", - """c[k,i] = a[k, i + 1] - out[k,i] = c[k,i]""", - ["..."]) + """ + c[k,i] = a[k, i + 1] + out[k,i] = c[k,i] + """) # transform knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") from loopy.kernel.tools import add_dtypes -knl = add_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) +knl = add_dtypes(knl, + {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) # schedule from loopy.preprocess import preprocess_kernel @@ -20,9 +22,11 @@ knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel knl = get_one_scheduled_kernel(knl) -print(knl) # map schedule onto host or device -from loopy.codegen.device_mapping import map_schedule_onto_host_or_device -knl = map_schedule_onto_host_or_device(knl) print(knl) + +cgr = lp.generate_code_v2(knl) + +print(cgr.device_code()) +print(cgr.host_code()) diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index a402896c85baa49bc23bb5d770607cf73641c273..fa581d4262e2f06addf81aeaecca5ed2f8f8c8f1 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -31,9 +31,9 @@ def transform(knl, vars, stream_dtype): def gen_code(knl): knl = lp.preprocess_kernel(knl) knl = lp.get_one_scheduled_kernel(knl) - ispc_code, arg_info = lp.generate_code(knl) + codegen_result = lp.generate_code_v2(knl) - return ispc_code + return codegen_result.device_code() + "\n" + codegen_result.host_code() NRUNS = 10 diff --git a/loopy/__init__.py b/loopy/__init__.py index 10bd1100126e79ff2451eaf96c45bd4be47adf3a..8562df4118534392cf66ada250aacdfdbc0e1917 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -110,7 +110,11 @@ from loopy.statistics import (get_op_poly, sum_ops_to_dtypes, get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping, sum_mem_access_to_bytes, gather_access_footprints, gather_access_footprint_bytes) -from loopy.codegen import generate_code, generate_body +from loopy.codegen import ( + generate_code, generate_code_v2, generate_body) +from loopy.codegen.result import ( + GeneratedProgram, + CodeGenerationResult) from loopy.compiled import CompiledKernel from loopy.options import Options from loopy.auto_test import auto_test_vs_ref @@ -196,7 +200,8 @@ __all__ = [ "preprocess_kernel", "realize_reduction", "infer_unknown_types", "generate_loop_schedules", "get_one_scheduled_kernel", - "generate_code", "generate_body", + "GeneratedProgram", "CodeGenerationResult", + "generate_code", "generate_code_v2", "generate_body", "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly", "get_DRAM_access_poly", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index f34a943831c68bd0d2cf6089d8b70ad4bc43022e..0adf4416d44e199f1b329205a3159736acfa3fa3 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -442,7 +442,8 @@ def auto_test_vs_ref( try: ref_args, ref_arg_data = \ - make_ref_args(ref_sched_kernel, ref_cl_kernel_info.impl_arg_info, + make_ref_args(ref_sched_kernel, + ref_cl_kernel_info.implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: @@ -529,7 +530,8 @@ def auto_test_vs_ref( if args is None: cl_kernel_info = compiled.cl_kernel_info(frozenset()) - args = make_args(kernel, cl_kernel_info.impl_arg_info, + args = make_args(kernel, + cl_kernel_info.implemented_data_info, queue, ref_arg_data, parameters) args["out_host"] = False diff --git a/loopy/check.py b/loopy/check.py index 5ae22cac5e5af8b727ae0ba577fc05a8e571b8a7..0ef3d27cfeea142e905dbb38aaaad9afbe06ebcf 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -60,7 +60,8 @@ def check_loop_priority_inames_known(kernel): def check_for_unused_hw_axes_in_insns(kernel): - group_size, local_size = kernel.get_grid_sizes_as_exprs() + # FIXME: This could be made specific to the current kernel piece. + group_size, local_size = kernel.get_grid_size_upper_bounds_as_exprs() group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 4056c0dea785a6d27225a4fd2b3337c9c9a14db9..ec0b39835de4da3df95d013ccdfb0417d0ab2b91 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -36,111 +36,95 @@ import logging logger = logging.getLogger(__name__) -# {{{ support code for AST wrapper objects - -class GeneratedInstruction(Record): - """Objects of this type are wrapped around ASTs upon - return from generation calls to collect information about them. - - :ivar implemented_domains: A map from an insn id to a list of - implemented domains, with the purpose of checking that - each instruction's exact iteration space has been covered. - """ - __slots__ = ["insn_id", "implemented_domain", "ast"] - - -class GeneratedCode(Record): - """Objects of this type are wrapped around ASTs upon - return from generation calls to collect information about them. +# {{{ implemented data info - :ivar implemented_domains: A map from an insn id to a list of - implemented domains, with the purpose of checking that - each instruction's exact iteration space has been covered. +class ImplementedDataInfo(Record): """ - __slots__ = ["ast", "implemented_domains"] - - -def gen_code_block(elements): - from cgen import Block, Comment, Line, Initializer - - block_els = [] - implemented_domains = {} + .. attribute:: name - for el in elements: - if isinstance(el, GeneratedCode): - for insn_id, idoms in six.iteritems(el.implemented_domains): - implemented_domains.setdefault(insn_id, []).extend(idoms) + The expanded name of the array. Note that, for example + in the case of separate-array-tagged axes, multiple + implemented arrays may correspond to one user-facing + array. - if isinstance(el.ast, Block): - block_els.extend(el.ast.contents) - else: - block_els.append(el.ast) + .. attribute:: dtype - elif isinstance(el, Initializer): - block_els.append(el) + .. attribute:: arg_class - elif isinstance(el, Comment): - block_els.append(el) + .. attribute:: base_name - elif isinstance(el, Line): - assert not el.text - block_els.append(el) + The user-facing name of the underlying array. + May be *None* for non-array arguments. - elif isinstance(el, GeneratedInstruction): - block_els.append(el.ast) - if el.implemented_domain is not None: - implemented_domains.setdefault(el.insn_id, []).append( - el.implemented_domain) + .. attribute:: shape + .. attribute:: strides - else: - raise ValueError("unrecognized object of type '%s' in block" - % type(el)) + Strides in multiples of ``dtype.itemsize``. - if len(block_els) == 1: - ast, = block_els - else: - ast = Block(block_els) + .. attribute:: unvec_shape + .. attribute:: unvec_strides - return GeneratedCode(ast=ast, implemented_domains=implemented_domains) + Strides in multiples of ``dtype.itemsize`` that accounts for + :class:`loopy.kernel.array.VectorArrayDimTag` in a scalar + manner -def wrap_in(cls, *args): - inner = args[-1] - args = args[:-1] + .. attribute:: offset_for_name + .. attribute:: stride_for_name_and_axis - if not isinstance(inner, GeneratedCode): - raise ValueError("unrecognized object of type '%s' in block" - % type(inner)) + A tuple *(name, axis)* indicating the (implementation-facing) + name of the array and axis number for which this argument provides + the strides. - args = args + (inner.ast,) + .. attribute:: allows_offset + .. attribute:: is_written + """ - return GeneratedCode(ast=cls(*args), - implemented_domains=inner.implemented_domains) + def __init__(self, target, name, dtype, arg_class, + base_name=None, + shape=None, strides=None, + unvec_shape=None, unvec_strides=None, + offset_for_name=None, stride_for_name_and_axis=None, + allows_offset=None, + is_written=None): + from loopy.types import LoopyType + assert isinstance(dtype, LoopyType) -def wrap_in_if(condition_codelets, inner): - from cgen import If + Record.__init__(self, + name=name, + dtype=dtype, + arg_class=arg_class, + base_name=base_name, + shape=shape, + strides=strides, + unvec_shape=unvec_shape, + unvec_strides=unvec_strides, + offset_for_name=offset_for_name, + stride_for_name_and_axis=stride_for_name_and_axis, + allows_offset=allows_offset, + is_written=is_written) - if condition_codelets: - return wrap_in(If, - "\n&& ".join(condition_codelets), - inner) +# }}} - return inner +# {{{ code generation state -def add_comment(cmt, code): - if cmt is None: - return code +class Unvectorizable(Exception): + pass - from cgen import add_comment - assert isinstance(code, GeneratedCode) - return GeneratedCode( - ast=add_comment(cmt, code.ast), - implemented_domains=code.implemented_domains) +class VectorizationInfo(object): + """ + .. attribute:: iname + .. attribute:: length + .. attribute:: space + """ -# }}} + def __init__(self, iname, length, space): + self.iname = iname + self.length = length + self.space = space class SeenFunction(Record): @@ -163,28 +147,13 @@ class SeenFunction(Record): + tuple((f, getattr(self, f)) for f in type(self).fields)) -# {{{ code generation state - -class Unvectorizable(Exception): - pass - - -class VectorizationInfo(object): - """ - .. attribute:: iname - .. attribute:: length - .. attribute:: space - """ - - def __init__(self, iname, length, space): - self.iname = iname - self.length = length - self.space = space - - class CodeGenerationState(object): """ .. attribute:: kernel + .. attribute:: implemented_data_info + + a list of :class:`ImplementedDataInfo` objects. + .. attribute:: implemented_domain The entire implemented domain (as an :class:`islpy.Set`) @@ -212,13 +181,28 @@ class CodeGenerationState(object): .. attribute:: vectorization_info None or an instance of :class:`VectorizationInfo` + + .. attribute:: is_generating_device_code + + .. attribute:: gen_program_name + + None (indicating that host code is being generated) + or the name of the device program currently being + generated. + + .. attribute:: schedule_index_end """ - def __init__(self, kernel, implemented_domain, implemented_predicates, + def __init__(self, kernel, + implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, - vectorization_info=None, var_name_generator=None): + vectorization_info=None, var_name_generator=None, + is_generating_device_code=None, + gen_program_name=None, + schedule_index_end=None): self.kernel = kernel + self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates self.seen_dtypes = seen_dtypes @@ -228,11 +212,21 @@ class CodeGenerationState(object): self.allow_complex = allow_complex self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator + self.is_generating_device_code = is_generating_device_code + self.gen_program_name = gen_program_name + self.schedule_index_end = schedule_index_end # {{{ copy helpers - def copy(self, implemented_domain=None, implemented_predicates=frozenset(), - var_subst_map=None, vectorization_info=None): + def copy(self, kernel=None, + implemented_domain=None, implemented_predicates=frozenset(), + var_subst_map=None, vectorization_info=None, + is_generating_device_code=None, + gen_program_name=None, + schedule_index_end=None): + + if kernel is None: + kernel = self.kernel if vectorization_info is False: vectorization_info = None @@ -240,8 +234,18 @@ class CodeGenerationState(object): elif vectorization_info is None: vectorization_info = self.vectorization_info + if is_generating_device_code is None: + is_generating_device_code = self.is_generating_device_code + + if gen_program_name is None: + gen_program_name = self.gen_program_name + + if schedule_index_end is None: + schedule_index_end = self.schedule_index_end + return CodeGenerationState( - kernel=self.kernel, + kernel=kernel, + implemented_data_info=self.implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( implemented_predicates or self.implemented_predicates), @@ -251,7 +255,10 @@ class CodeGenerationState(object): var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, vectorization_info=vectorization_info, - var_name_generator=self.var_name_generator) + var_name_generator=self.var_name_generator, + is_generating_device_code=is_generating_device_code, + gen_program_name=gen_program_name, + schedule_index_end=schedule_index_end) def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" @@ -270,10 +277,7 @@ class CodeGenerationState(object): @property def expression_to_code_mapper(self): - # It's kind of unfortunate that this is here, but it's an accident - # of history for now. - - return self.kernel.target.get_expression_to_code_mapper(self) + return self.ast_builder.get_expression_to_code_mapper(self) def intersect(self, other): new_impl, new_other = isl.align_two(self.implemented_domain, other) @@ -334,118 +338,17 @@ class CodeGenerationState(object): for i in range(vinf.length): idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i new_codegen_state = novec_self.fix(vinf.iname, idx_aff) - result.append(func(new_codegen_state)) - - return gen_code_block(result) -# }}} - - -# {{{ cgen overrides - -from cgen import Declarator - - -class POD(Declarator): - """A simple declarator: The type is given as a :class:`numpy.dtype` - and the *name* is given as a string. - """ - - def __init__(self, target, dtype, name): - from loopy.types import LoopyType - assert isinstance(dtype, LoopyType) - - self.target = target - self.ctype = target.dtype_to_typename(dtype) - self.dtype = dtype - self.name = name - - def get_decl_pair(self): - return [self.ctype], self.name - - def struct_maker_code(self, name): - return name - - def struct_format(self): - return self.dtype.char - - def alignment_requirement(self): - return self.target.alignment_requirement(self) - - def default_value(self): - return 0 - -# }}} + result.extend(func(new_codegen_state)) + from loopy.codegen.result import merge_codegen_results + return merge_codegen_results(self, result) -# {{{ implemented data info - -class ImplementedDataInfo(Record): - """ - .. attribute:: name - - The expanded name of the array. Note that, for example - in the case of separate-array-tagged axes, multiple - implemented arrays may correspond to one user-facing - array. - - .. attribute:: dtype - .. attribute:: cgen_declarator - - Declarator syntax tree as a :mod:`cgen` object. - - .. attribute:: arg_class - - .. attribute:: base_name - - The user-facing name of the underlying array. - May be *None* for non-array arguments. - - .. attribute:: shape - .. attribute:: strides - - Strides in multiples of ``dtype.itemsize``. - - .. attribute:: unvec_shape - .. attribute:: unvec_strides - - Strides in multiples of ``dtype.itemsize`` that accounts for - :class:`loopy.kernel.array.VectorArrayDimTag` in a scalar - manner - - - .. attribute:: offset_for_name - .. attribute:: stride_for_name_and_axis - - A tuple *(name, axis)* indicating the (implementation-facing) - name of the array and axis number for which this argument provides - the strides. - - .. attribute:: allows_offset - """ - - def __init__(self, target, name, dtype, cgen_declarator, arg_class, - base_name=None, - shape=None, strides=None, - unvec_shape=None, unvec_strides=None, - offset_for_name=None, stride_for_name_and_axis=None, - allows_offset=None): - - from loopy.types import LoopyType - assert isinstance(dtype, LoopyType) - - Record.__init__(self, - name=name, - dtype=dtype, - cgen_declarator=cgen_declarator, - arg_class=arg_class, - base_name=base_name, - shape=shape, - strides=strides, - unvec_shape=unvec_shape, - unvec_strides=unvec_strides, - offset_for_name=offset_for_name, - stride_for_name_and_axis=stride_for_name_and_axis, - allows_offset=allows_offset) + @property + def ast_builder(self): + if self.is_generating_device_code: + return self.kernel.target.get_device_ast_builder() + else: + return self.kernel.target.get_host_ast_builder() # }}} @@ -456,11 +359,10 @@ code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, # {{{ main code generation entrypoint -def generate_code(kernel, device=None): - if device is not None: - from warnings import warn - warn("passing 'device' to generate_code() is deprecated", - DeprecationWarning, stacklevel=2) +def generate_code_v2(kernel): + """ + :returns: a :class:`CodeGenerationResult` + """ from loopy.kernel import kernel_state if kernel.state == kernel_state.INITIAL: @@ -503,23 +405,24 @@ def generate_code(kernel, device=None): from loopy.kernel.data import ValueArg from loopy.kernel.array import ArrayBase - impl_arg_info = [] + implemented_data_info = [] for arg in kernel.args: + is_written = arg.name in kernel.get_written_variables() if isinstance(arg, ArrayBase): - impl_arg_info.extend( + implemented_data_info.extend( arg.decl_info( kernel.target, - is_written=arg.name in kernel.get_written_variables(), + is_written=is_written, index_dtype=kernel.index_dtype)) elif isinstance(arg, ValueArg): - impl_arg_info.append(ImplementedDataInfo( + implemented_data_info.append(ImplementedDataInfo( target=kernel.target, name=arg.name, dtype=arg.dtype, - cgen_declarator=arg.get_arg_decl(kernel.target), - arg_class=ValueArg)) + arg_class=ValueArg, + is_written=is_written)) else: raise ValueError("argument type not understood: '%s'" % type(arg)) @@ -538,6 +441,7 @@ def generate_code(kernel, device=None): initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( kernel=kernel, + implemented_data_info=implemented_data_info, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), seen_dtypes=seen_dtypes, @@ -545,14 +449,21 @@ def generate_code(kernel, device=None): seen_atomic_dtypes=seen_atomic_dtypes, var_subst_map={}, allow_complex=allow_complex, - var_name_generator=kernel.get_var_name_generator()) + var_name_generator=kernel.get_var_name_generator(), + is_generating_device_code=False, + gen_program_name=kernel.name, + schedule_index_end=len(kernel.schedule)) + + from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( + codegen_state, + schedule_index=0) - code_str, implemented_domains = kernel.target.generate_code( - kernel, codegen_state, impl_arg_info) + device_code_str = codegen_result.device_code() from loopy.check import check_implemented_domains - assert check_implemented_domains(kernel, implemented_domains, - code_str) + assert check_implemented_domains(kernel, codegen_result.implemented_domains, + device_code_str) # {{{ handle preambles @@ -576,7 +487,7 @@ def generate_code(kernel, device=None): seen_atomic_dtypes=seen_atomic_dtypes) preamble_generators = (kernel.preamble_generators - + kernel.target.preamble_generators()) + + kernel.target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -595,71 +506,48 @@ def generate_code(kernel, device=None): remove_common_indentation(lines) + "\n" for lines in dedup_preambles] - code_str = "".join(preamble_codes) + code_str + codegen_result = codegen_result.copy( + device_preambles=preamble_codes) # }}} logger.info("%s: generate code: done" % kernel.name) - result = code_str, impl_arg_info - if CACHING_ENABLED: - code_gen_cache[input_kernel] = result + code_gen_cache[input_kernel] = codegen_result - return result + return codegen_result -# }}} - - -# {{{ generate function body -def generate_body(kernel): - if kernel.schedule is None: - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) - from loopy.kernel import kernel_state - if kernel.state != kernel_state.SCHEDULED: - raise LoopyError("cannot generate code for a kernel that has not been " - "scheduled") +def generate_code(kernel, device=None): + if device is not None: + from warnings import warn + warn("passing 'device' to generate_code() is deprecated", + DeprecationWarning, stacklevel=2) - from loopy.preprocess import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + codegen_result = generate_code_v2(kernel) - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + if len(codegen_result.device_programs) > 1: + raise LoopyError("kernel passed to generate_code yielded multiple " + "device programs. Use generate_code_v2.") - logger.info("%s: generate code: start" % kernel.name) + return codegen_result.device_code(), codegen_result.implemented_data_info - allow_complex = False - for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): - if var.dtype.involves_complex(): - allow_complex = True +# }}} - seen_dtypes = set() - seen_functions = set() - seen_atomic_dtypes = set() - initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) - codegen_state = CodeGenerationState( - kernel=kernel, - implemented_domain=initial_implemented_domain, - implemented_predicates=frozenset(), - seen_dtypes=seen_dtypes, - seen_functions=seen_functions, - seen_atomic_dtypes=seen_atomic_dtypes, - var_subst_map={}, - allow_complex=allow_complex) +# {{{ generate function body - code_str, implemented_domains = kernel.target.generate_body( - kernel, codegen_state) +def generate_body(kernel): + codegen_result = generate_code_v2(kernel) - from loopy.check import check_implemented_domains - assert check_implemented_domains(kernel, implemented_domains, - code_str) + if len(codegen_result.device_programs) != 1: + raise LoopyError("generate_body cannot be used on programs " + "that yield more than one device program") - logger.info("%s: generate code: done" % kernel.name) + dev_prg, = codegen_result.device_programs - return code_str + return str(dev_prg.body_ast) # }}} diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 19ac4106ba58821e5d4bf5231eb530977739c7f3..fb254bd54480f716de54de96f6aab9a4bb427767 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -25,17 +25,6 @@ THE SOFTWARE. import islpy as isl from islpy import dim_type -from pymbolic.mapper.stringifier import PREC_NONE - - -def constraint_to_code(ecm, cns): - if cns.is_equality(): - comp_op = "==" - else: - comp_op = ">=" - - from loopy.symbolic import constraint_to_expr - return "%s %s 0" % (ecm(constraint_to_expr(cns), PREC_NONE, "i"), comp_op) # {{{ bounds check generator diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 0b509fad8ec2d3ae6a21d5a228e0fd578cb0ab4c..7b73093873e7a5364945f8888723be639f37dcf4 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -27,17 +27,20 @@ THE SOFTWARE. """ -from loopy.codegen import gen_code_block +from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl -from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier, - gather_schedule_subloop, generate_sub_sched_items) +from loopy.schedule import ( + EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel, + gather_schedule_block, generate_sub_sched_items) -def get_admissible_conditional_inames_for(kernel, sched_index): +def get_admissible_conditional_inames_for(codegen_state, sched_index): """This function disallows conditionals on local-idx tagged inames if there is a barrier nested somewhere within. """ + kernel = codegen_state.kernel + from loopy.kernel.data import LocalIndexTag, HardwareParallelTag from loopy.schedule import find_active_inames_at, has_barrier_within @@ -46,17 +49,48 @@ def get_admissible_conditional_inames_for(kernel, sched_index): has_barrier = has_barrier_within(kernel, sched_index) for iname, tag in six.iteritems(kernel.iname_to_tag): - if isinstance(tag, HardwareParallelTag): + if (isinstance(tag, HardwareParallelTag) + and codegen_state.is_generating_device_code): if not has_barrier or not isinstance(tag, LocalIndexTag): result.add(iname) return frozenset(result) -def generate_code_for_sched_index(kernel, sched_index, codegen_state): +def generate_code_for_sched_index(codegen_state, sched_index): + kernel = codegen_state.kernel sched_item = kernel.schedule[sched_index] - if isinstance(sched_item, EnterLoop): + if isinstance(sched_item, CallKernel): + assert not codegen_state.is_generating_device_code + + from loopy.schedule import (gather_schedule_block, get_insn_ids_for_block_at) + _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) + assert past_end_i <= codegen_state.schedule_index_end + + new_codegen_state = codegen_state.copy( + is_generating_device_code=True, + gen_program_name=sched_item.kernel_name, + schedule_index_end=past_end_i-1) + + from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( + new_codegen_state, sched_index + 1) + + glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insn_ids_for_block_at(kernel.schedule, sched_index)) + + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + ()), + ]) + + elif isinstance(sched_item, EnterLoop): tag = kernel.iname_to_tag.get(sched_item.iname) from loopy.codegen.loop import ( @@ -76,10 +110,11 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): raise RuntimeError("encountered (invalid) EnterLoop " "for '%s', tagged '%s'" % (sched_item.iname, tag)) - return func(kernel, sched_index, codegen_state) + return func(codegen_state, sched_index) elif isinstance(sched_item, Barrier): - return kernel.target.emit_barrier(sched_item.kind, sched_item.comment) + return codegen_state.ast_builder.emit_barrier( + sched_item.kind, sched_item.comment) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -87,7 +122,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): from loopy.codegen.instruction import generate_instruction_code return codegen_state.try_vectorized( "instruction %s" % insn.id, - lambda inner_cgs: generate_instruction_code(kernel, insn, inner_cgs)) + lambda inner_cgs: generate_instruction_code(inner_cgs, insn)) else: raise RuntimeError("unexpected schedule item type: %s" @@ -132,18 +167,20 @@ def group_by(l, key, merge): return result -def build_loop_nest(kernel, sched_index, codegen_state): +def build_loop_nest(codegen_state, schedule_index): # Most of the complexity of this function goes towards finding groups of # instructions that can be nested inside a shared conditional. + kernel = codegen_state.kernel + # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices # i.e. go up to the next LeaveLoop, and skip over inner loops. my_sched_indices = [] - i = sched_index - while i < len(kernel.schedule): + i = schedule_index + while i < codegen_state.schedule_index_end: sched_item = kernel.schedule[i] if isinstance(sched_item, LeaveLoop): @@ -151,9 +188,11 @@ def build_loop_nest(kernel, sched_index, codegen_state): my_sched_indices.append(i) - if isinstance(sched_item, EnterLoop): - _, i = gather_schedule_subloop( - kernel.schedule, i) + if isinstance(sched_item, (EnterLoop, CallKernel)): + _, i = gather_schedule_block(kernel.schedule, i) + assert i <= codegen_state.schedule_index_end, \ + "schedule block extends beyond schedule_index_end" + elif isinstance(sched_item, Barrier): i += 1 @@ -184,7 +223,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): ScheduleIndexInfo( schedule_indices=[i], admissible_cond_inames=( - get_admissible_conditional_inames_for(kernel, i)), + get_admissible_conditional_inames_for(codegen_state, i)), required_predicates=get_required_predicates(kernel, i), used_inames_within=find_used_inames_within(kernel, i) ) @@ -353,7 +392,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): result = [] for i in origin_si_entry.schedule_indices: inner = generate_code_for_sched_index( - kernel, i, inner_codegen_state) + inner_codegen_state, i) if inner is not None: result.append(inner) @@ -372,22 +411,23 @@ def build_loop_nest(kernel, sched_index, codegen_state): # gen_code returns a list if bounds_checks or pred_checks: - from loopy.codegen import wrap_in_if - from loopy.codegen.bounds import constraint_to_code + from loopy.symbolic import constraint_to_expr prev_gen_code = gen_code def gen_code(inner_codegen_state): - conditionals = [ - constraint_to_code( - inner_codegen_state.expression_to_code_mapper, cns) - for cns in bounds_checks] + list(pred_checks) + from pymbolic.primitives import Variable + condition_exprs = [ + constraint_to_expr(cns) + for cns in bounds_checks] + [ + Variable(pred_chk) for pred_chk in pred_checks] prev_result = prev_gen_code(inner_codegen_state) return [wrap_in_if( - conditionals, - gen_code_block(prev_result))] + inner_codegen_state, + condition_exprs, + merge_codegen_results(codegen_state, prev_result))] cannot_vectorize = False if new_codegen_state.vectorization_info is not None: @@ -404,7 +444,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # gen_code returns a list, but this needs to return a # GeneratedCode instance. - return gen_code_block(gen_code(inner_codegen_state)) + return gen_code(inner_codegen_state) result = [new_codegen_state.unvectorize(gen_code_wrapper)] else: @@ -418,8 +458,10 @@ def build_loop_nest(kernel, sched_index, codegen_state): # }}} - return gen_code_block( - build_insn_group(sched_index_info_entries, codegen_state)) + insn_group = build_insn_group(sched_index_info_entries, codegen_state) + return merge_codegen_results( + codegen_state, + insn_group) # vim: foldmethod=marker diff --git a/loopy/codegen/device_mapping.py b/loopy/codegen/device_mapping.py index d40b0c4d255eda0159c77fbf56889b0a55e5f347..af6f9712a73da22cdc461a76444adaf0dd4b9e6e 100644 --- a/loopy/codegen/device_mapping.py +++ b/loopy/codegen/device_mapping.py @@ -1,25 +1,43 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2016 Matt Wala" + +__license__ = """ +(unclear) +""" + +# TODO: Matt, please replace the license header +# TODO: Should move to loopy.schedule.device_mapping + from pytools import Record +from loopy.diagnostic import LoopyError class HostForLoop(Record): + # TOOD: Should have docstring indicating what attributes can occur pass class HostConditional(Record): + # TOOD: Should have docstring indicating what attributes can occur pass class HostBlock(Record): + # TOOD: Should have docstring indicating what attributes can occur pass class HostInvokeKernel(Record): + # TOOD: Should have docstring indicating what attributes can occur pass def map_schedule_onto_host_or_device(kernel): - from pytools import UniqueNameGenerator - kernel_name_gen = UniqueNameGenerator(forced_prefix=kernel.name) + from functools import partial + kernel_name_gen = partial( + kernel.get_var_name_generator(), + kernel.name + kernel.target.device_program_name_suffix) from loopy.schedule import ( RunInstruction, EnterLoop, LeaveLoop, Barrier, @@ -67,7 +85,6 @@ def map_schedule_onto_host_or_device(kernel): if loop_required_splitting: schedule_required_splitting = True if current_chunk: - # TODO: Do a better job of naming the kernel... new_kernel_name = kernel_name_gen() new_schedule.extend( # TODO: Infer kernel arguments @@ -92,7 +109,6 @@ def map_schedule_onto_host_or_device(kernel): # Wrap the current chunk into a kernel call. schedule_required_splitting = True if current_chunk: - # TODO: Do a better job of naming the kernel new_kernel_name = kernel_name_gen() new_schedule.extend( # TODO: Infer kernel arguments @@ -106,8 +122,8 @@ def map_schedule_onto_host_or_device(kernel): current_chunk.append(sched_item) i += 1 else: - # TODO: Make error message more informative. - raise ValueError() + raise LoopyError("unexepcted type of schedule item: %s" + % type(sched_item).__name__) if current_chunk and schedule_required_splitting: # Wrap remainder of schedule into a kernel call. @@ -131,7 +147,7 @@ def map_schedule_onto_host_or_device(kernel): if not split_kernel: # Wrap everything into a kernel call. new_schedule = ( - [CallKernel(kernel_name=kernel.name)] + + [CallKernel(kernel_name=kernel_name_gen())] + new_schedule + [ReturnFromKernel(kernel_name=kernel.name)]) new_kernel = kernel.copy(schedule=new_schedule) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 7b95f59482c826473e069d9f07a80dd97d7ea0ce..db3d15184a94aa13e6a4b449a2036869e34ca566 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -27,12 +27,16 @@ THE SOFTWARE. from six.moves import range import islpy as isl -from loopy.codegen import GeneratedInstruction, Unvectorizable +from loopy.codegen import Unvectorizable +from loopy.codegen.result import CodeGenerationResult from pymbolic.mapper.stringifier import PREC_NONE -def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, stmt): - from loopy.codegen.bounds import get_bounds_checks, constraint_to_code +def to_codegen_result( + codegen_state, insn_id, domain, check_inames, required_preds, ast): + from loopy.codegen.bounds import get_bounds_checks + from loopy.symbolic import constraint_to_expr + bounds_checks = get_bounds_checks( domain, check_inames, codegen_state.implemented_domain, overapproximate=False) @@ -43,53 +47,54 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st new_implemented_domain = new_implemented_domain & bounds_check_set if bounds_check_set.is_empty(): - return None, None + return None - condition_codelets = [ - constraint_to_code( - codegen_state.expression_to_code_mapper, cns) + condition_exprs = [ + constraint_to_expr(cns) for cns in bounds_checks] - condition_codelets.extend( + condition_exprs.extend( required_preds - codegen_state.implemented_predicates) - if condition_codelets: - from cgen import If - stmt = If("\n&& ".join(condition_codelets), stmt) + if condition_exprs: + from pymbolic.primitives import LogicalAnd + from pymbolic.mapper.stringifier import PREC_NONE + ast = codegen_state.ast_builder.emit_if( + codegen_state.expression_to_code_mapper( + LogicalAnd(tuple(condition_exprs)), PREC_NONE), + ast) - return stmt, new_implemented_domain + return CodeGenerationResult.new( + codegen_state, insn_id, ast, new_implemented_domain) -def generate_instruction_code(kernel, insn, codegen_state): +def generate_instruction_code(codegen_state, insn): + kernel = codegen_state.kernel + from loopy.kernel.data import Assignment, CallInstruction, CInstruction if isinstance(insn, Assignment): - result = generate_expr_instruction_code(kernel, insn, codegen_state) + ast = generate_assignment_instruction_code(codegen_state, insn) elif isinstance(insn, CallInstruction): - result = generate_call_code(kernel, insn, codegen_state) + ast = generate_call_code(codegen_state, insn) elif isinstance(insn, CInstruction): - result = generate_c_instruction_code(kernel, insn, codegen_state) + ast = generate_c_instruction_code(codegen_state, insn) else: raise RuntimeError("unexpected instruction type") insn_inames = kernel.insn_inames(insn) - insn_code, impl_domain = wrap_in_conditionals( + return to_codegen_result( codegen_state, + insn.id, kernel.get_inames_domain(insn_inames), insn_inames, insn.predicates, - result) + ast) - if insn_code is None: - return None - return GeneratedInstruction( - insn_id=insn.id, - implemented_domain=impl_domain, - ast=insn_code) +def generate_assignment_instruction_code(codegen_state, insn): + kernel = codegen_state.kernel - -def generate_expr_instruction_code(kernel, insn, codegen_state): ecm = codegen_state.expression_to_code_mapper from loopy.expression import dtype_to_type_context, VectorizabilityChecker @@ -137,8 +142,8 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None) rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype) if lhs_atomicity is None: - from cgen import Assign - result = Assign( + result = codegen_state.ast_builder.emit_assignment( + codegen_state, lhs_code, ecm(insn.expression, prec=PREC_NONE, type_context=rhs_type_context, @@ -149,7 +154,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): elif isinstance(lhs_atomicity, AtomicUpdate): codegen_state.seen_atomic_dtypes.add(lhs_dtype) - result = kernel.target.generate_atomic_update( + result = codegen_state.ast_builder.generate_atomic_update( kernel, codegen_state, lhs_atomicity, lhs_var, insn.assignee, insn.expression, lhs_dtype, rhs_type_context) @@ -166,7 +171,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): from cgen import Statement as S # noqa - gs, ls = kernel.get_grid_sizes() + gs, ls = kernel.get_grid_size_upper_bounds() printf_format = "%s.%s[%s][%s]: %s" % ( kernel.name, @@ -220,7 +225,9 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): return result -def generate_call_code(kernel, insn, codegen_state): +def generate_call_code(codegen_state, insn): + kernel = codegen_state.kernel + # {{{ vectorization handling if codegen_state.vectorization_info: @@ -229,7 +236,7 @@ def generate_call_code(kernel, insn, codegen_state): # }}} - result = kernel.target.generate_multiple_assignment( + result = codegen_state.ast_builder.emit_multiple_assignment( codegen_state, insn) # {{{ tracing @@ -242,13 +249,15 @@ def generate_call_code(kernel, insn, codegen_state): return result -def generate_c_instruction_code(kernel, insn, codegen_state): +def generate_c_instruction_code(codegen_state, insn): + kernel = codegen_state.kernel + if codegen_state.vectorization_info is not None: raise Unvectorizable("C instructions cannot be vectorized") body = [] - from loopy.codegen import POD + from loopy.target.c import POD from cgen import Initializer, Block, Line from pymbolic.primitives import Variable @@ -260,7 +269,7 @@ def generate_c_instruction_code(kernel, insn, codegen_state): body.append( Initializer( - POD(kernel.target, kernel.index_dtype, name), + POD(codegen_state.ast_builder, kernel.index_dtype, name), codegen_state.expression_to_code_mapper( iname_expr, prec=PREC_NONE, type_context="i"))) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 70530d3a90cbdd5a48395034dd05000f6781b473..be7840f27e92dc0339cd678e9dca40421462ac04 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -25,7 +25,7 @@ THE SOFTWARE. from six.moves import range from loopy.diagnostic import warn, LoopyError -from loopy.codegen import gen_code_block +from loopy.codegen.result import merge_codegen_results import islpy as isl from islpy import dim_type from loopy.codegen.control import build_loop_nest @@ -34,7 +34,7 @@ from pymbolic.mapper.stringifier import PREC_NONE # {{{ conditional-reducing slab decomposition -def get_slab_decomposition(kernel, iname, sched_index, codegen_state): +def get_slab_decomposition(kernel, iname): iname_domain = kernel.get_inames_domain(iname) if iname_domain.is_empty(): @@ -116,7 +116,9 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): # {{{ unrolled loops -def generate_unroll_loop(kernel, sched_index, codegen_state): +def generate_unroll_loop(codegen_state, sched_index): + kernel = codegen_state.kernel + iname = kernel.schedule[sched_index].iname bounds = kernel.get_iname_bounds(iname, constants_only=True) @@ -147,16 +149,18 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): idx_aff = lower_bound_aff + i new_codegen_state = codegen_state.fix(iname, idx_aff) result.append( - build_loop_nest(kernel, sched_index+1, new_codegen_state)) + build_loop_nest(new_codegen_state, sched_index+1)) - return gen_code_block(result) + return merge_codegen_results(codegen_state, result) # }}} # {{{ vectorized loops -def generate_vectorize_loop(kernel, sched_index, codegen_state): +def generate_vectorize_loop(codegen_state, sched_index): + kernel = codegen_state.kernel + iname = kernel.schedule[sched_index].iname bounds = kernel.get_iname_bounds(iname, constants_only=True) @@ -206,7 +210,7 @@ def generate_vectorize_loop(kernel, sched_index, codegen_state): length=length, space=length_aff.space)) - return build_loop_nest(kernel, sched_index+1, new_codegen_state) + return build_loop_nest(new_codegen_state, sched_index+1) # }}} @@ -222,8 +226,10 @@ def intersect_kernel_with_slab(kernel, slab, iname): # {{{ hw-parallel loop -def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, +def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, hw_inames_left=None): + kernel = codegen_state.kernel + from loopy.kernel.data import ( UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag) @@ -233,9 +239,11 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] if not hw_inames_left: - return build_loop_nest(kernel, sched_index, codegen_state) + return next_func(codegen_state) - global_size, local_size = kernel.get_grid_sizes() + from loopy.schedule import get_insn_ids_for_block_at + global_size, local_size = kernel.get_grid_sizes_for_insn_ids( + get_insn_ids_for_block_at(kernel.schedule, schedule_index)) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() @@ -293,8 +301,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, # }}} - slabs = get_slab_decomposition( - kernel, iname, sched_index, codegen_state) + slabs = get_slab_decomposition(kernel, iname) if other_inames_with_same_tag and len(slabs) > 1: raise RuntimeError("cannot do slab decomposition on inames that share " @@ -302,38 +309,39 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, result = [] - from loopy.codegen import add_comment - for slab_name, slab in slabs: - cmt = "%s slab for '%s'" % (slab_name, iname) if len(slabs) == 1: - cmt = None + result.append( + codegen_state.ast_builder.emit_comment( + "%s slab for '%s'" % (slab_name, iname))) # Have the conditional infrastructure generate the # slabbing conditionals. slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname) - new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr) + new_codegen_state = (codegen_state + .copy_and_assign(iname, hw_axis_expr) + .copy(kernel=slabbed_kernel)) inner = set_up_hw_parallel_loops( - slabbed_kernel, sched_index, - new_codegen_state, hw_inames_left) + new_codegen_state, schedule_index, next_func, + hw_inames_left) - result.append(add_comment(cmt, inner)) + result.append(inner) - from loopy.codegen import gen_code_block - return gen_code_block(result) + return merge_codegen_results(codegen_state, result) # }}} # {{{ sequential loop -def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): +def generate_sequential_loop_dim_code(codegen_state, sched_index): + kernel = codegen_state.kernel + ecm = codegen_state.expression_to_code_mapper loop_iname = kernel.schedule[sched_index].iname - slabs = get_slab_decomposition( - kernel, loop_iname, sched_index, codegen_state) + slabs = get_slab_decomposition(kernel, loop_iname) from loopy.codegen.bounds import get_usable_inames_for_conditional @@ -411,40 +419,45 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): dim_type.set, impl_slab.dim(dim_type.set), dt, idx, 1) - new_codegen_state = codegen_state.intersect(impl_slab) + new_codegen_state = ( + codegen_state + .intersect(impl_slab) + .copy(kernel=intersect_kernel_with_slab( + kernel, slab, iname))) - inner = build_loop_nest( - intersect_kernel_with_slab( - kernel, slab, iname), - sched_index+1, new_codegen_state) + inner = build_loop_nest(new_codegen_state, sched_index+1) # }}} if cmt is not None: - from cgen import Comment - result.append(Comment(cmt)) + result.append(codegen_state.ast_builder.emit_comment(cmt)) - from loopy.codegen import POD - from cgen import Initializer, Const, Line from loopy.symbolic import aff_to_expr + astb = codegen_state.ast_builder + if (static_ubound - static_lbound).plain_is_zero(): # single-trip, generate just a variable assignment, not a loop - result.append(gen_code_block([ - Initializer( - Const(POD(kernel.target, kernel.index_dtype, loop_iname)), - ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), - Line(), + result.append(merge_codegen_results(codegen_state, [ + astb.emit_initializer( + codegen_state, + kernel.index_dtype, loop_iname, + ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), + is_const=True), + astb.emit_blank_line(), inner, ])) else: + inner_ast = inner.current_ast(codegen_state) result.append( - kernel.target.emit_sequential_loop( - codegen_state, loop_iname, kernel.index_dtype, - static_lbound, static_ubound, inner)) + inner.with_new_ast( + codegen_state, + astb.emit_sequential_loop( + codegen_state, loop_iname, kernel.index_dtype, + static_lbound, static_ubound, inner_ast))) - return gen_code_block(result) + return merge_codegen_results(codegen_state, result) # }}} diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py new file mode 100644 index 0000000000000000000000000000000000000000..9d2c44fecb70ccb60304a0ef473ac9315c762880 --- /dev/null +++ b/loopy/codegen/result.py @@ -0,0 +1,280 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six +from pytools import Record + + +# {{{ code generation result + +class GeneratedProgram(Record): + """ + .. attribute:: name + + .. attribute:: is_device_program + + .. attribute:: ast + + Once generated, this captures the AST of the overall function + definition, including the body. + + .. attribute:: body_ast + + Once generated, this captures the AST of the operative function + body (including declaration of necessary temporaries), but not + the overall function definition. + """ + + +class CodeGenerationResult(Record): + """ + .. attribute:: host_program + .. attribute:: device_programs + + A list of :class:`GeneratedProgram` instances + intended to run on the compute device. + + .. attribute:: implemented_domains + + A mapping from instruction ID to a list of :class:`islpy.Set` + objects. + + .. attribute:: host_preambles + .. attribute:: device_preambles + + .. automethod:: host_code + .. automethod:: device_code + + .. attribute:: implemented_data_info + + a list of :class:`loopy.codegen.ImplementedDataInfo` objects. + Only added at the very end of code generation. + """ + + @staticmethod + def new(codegen_state, insn_id, ast, implemented_domain): + prg = GeneratedProgram( + name=codegen_state.gen_program_name, + is_device_program=codegen_state.is_generating_device_code, + ast=ast) + + if codegen_state.is_generating_device_code: + kwargs = { + "host_program": None, + "device_programs": [prg], + } + else: + kwargs = { + "host_program": prg, + "device_programs": [], + } + + return CodeGenerationResult( + implemented_data_info=codegen_state.implemented_data_info, + implemented_domains={insn_id: [implemented_domain]}, + **kwargs) + + def host_code(self): + preamble_codes = getattr(self, "host_preambles", []) + + return ( + "".join(preamble_codes) + + + str(self.host_program.ast)) + + def device_code(self): + preamble_codes = getattr(self, "device_preambles", []) + + return ( + "".join(preamble_codes) + + "\n" + + "\n\n".join(str(dp.ast) for dp in self.device_programs)) + + def current_program(self, codegen_state): + if codegen_state.is_generating_device_code: + if self.device_programs: + result = self.device_programs[-1] + else: + result = None + else: + result = self.host_program + + if result is None: + ast = codegen_state.ast_builder.ast_block_class([]) + result = GeneratedProgram( + name=codegen_state.gen_program_name, + is_device_program=codegen_state.is_generating_device_code, + ast=ast) + + assert result.name == codegen_state.gen_program_name + return result + + def with_new_program(self, codegen_state, program): + if codegen_state.is_generating_device_code: + assert program.name == codegen_state.gen_program_name + assert program.is_device_program + return self.copy( + device_programs=( + self.device_programs[:-1] + + + [program])) + else: + assert program.name == codegen_state.gen_program_name + assert not program.is_device_program + return self.copy(host_program=program) + + def current_ast(self, codegen_state): + return self.current_program(codegen_state).ast + + def with_new_ast(self, codegen_state, new_ast): + return self.with_new_program( + codegen_state, + self.current_program(codegen_state).copy( + ast=new_ast)) + + +# }}} + + +# {{{ support code for AST merging + +def merge_codegen_results(codegen_state, elements, collapse=True): + elements = [el for el in elements if el is not None] + + if not elements: + return CodeGenerationResult( + host_program=None, + device_programs=[], + implemented_domains={}, + implemented_data_info=codegen_state.implemented_data_info) + + ast_els = [] + new_device_programs = [] + dev_program_names = set() + implemented_domains = {} + codegen_result = None + + block_cls = codegen_state.ast_builder.ast_block_class + + for el in elements: + if isinstance(el, CodeGenerationResult): + if codegen_result is None: + codegen_result = el + else: + assert ( + el.current_program(codegen_state).name + == codegen_result.current_program(codegen_state).name) + + for insn_id, idoms in six.iteritems(el.implemented_domains): + implemented_domains.setdefault(insn_id, []).extend(idoms) + + if not codegen_state.is_generating_device_code: + for dp in el.device_programs: + if dp.name not in dev_program_names: + new_device_programs.append(dp) + dev_program_names.add(dp.name) + + cur_ast = el.current_ast(codegen_state) + if isinstance(cur_ast, block_cls): + ast_els.extend(cur_ast.contents) + else: + ast_els.append(cur_ast) + + else: + ast_els.append(el) + + if collapse and len(ast_els) == 1: + ast, = ast_els + else: + ast = block_cls(ast_els) + + kwargs = {} + if not codegen_state.is_generating_device_code: + kwargs["device_programs"] = new_device_programs + + return (codegen_result + .with_new_ast(codegen_state, ast) + .copy( + implemented_domains=implemented_domains, + **kwargs)) + + +def wrap_in_if(codegen_state, condition_exprs, inner): + if condition_exprs: + from pymbolic.primitives import LogicalAnd + from pymbolic.mapper.stringifier import PREC_NONE + cur_ast = inner.current_ast(codegen_state) + return inner.with_new_ast( + codegen_state, + codegen_state.ast_builder.emit_if( + codegen_state.expression_to_code_mapper( + LogicalAnd(tuple(condition_exprs)), PREC_NONE), + cur_ast)) + + return inner + +# }}} + + +# {{{ program generation top-level + +def generate_host_or_device_program(codegen_state, schedule_index): + ast_builder = codegen_state.ast_builder + temp_decls = ast_builder.get_temporary_decls(codegen_state) + + from functools import partial + + from loopy.codegen.control import build_loop_nest + next_func = partial(build_loop_nest, schedule_index=schedule_index) + + if codegen_state.is_generating_device_code: + from loopy.codegen.loop import set_up_hw_parallel_loops + codegen_result = set_up_hw_parallel_loops( + codegen_state, schedule_index, next_func=next_func) + else: + codegen_result = next_func(codegen_state) + + codegen_result = merge_codegen_results( + codegen_state, + temp_decls + [codegen_result], + collapse=False) + + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) + + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) + + codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=fdef_ast, + body_ast=body_ast)) + + return codegen_result + +# }}} diff --git a/loopy/compiled.py b/loopy/compiled.py index 2125fd66347c99adb4014a60c34a6ce44bc05e95..a30e4b133c60987213d5a62482b7a04a21cc05ad 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -25,7 +25,6 @@ THE SOFTWARE. import six from six.moves import range, zip -import sys import numpy as np from pytools import Record, memoize_method from loopy.diagnostic import ParameterFinderWarning @@ -137,7 +136,7 @@ def python_dtype_str(dtype): # {{{ integer arg finding from shapes -def generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options): +def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -149,7 +148,7 @@ def generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options dep_map = DependencyMapper() from pymbolic import var - for arg in impl_arg_info: + for arg in implemented_data_info: if arg.arg_class is GlobalArg: sym_shape = var(arg.name).attr("shape") for axis_nr, shape_i in enumerate(arg.shape): @@ -208,11 +207,13 @@ def generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options # {{{ integer arg finding from offsets -def generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options): +def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info): + options = kernel.options + gen("# {{{ find integer arguments from offsets") gen("") - for arg in impl_arg_info: + for arg in implemented_data_info: impl_array_name = arg.offset_for_name if impl_array_name is not None: gen("if %s is None:" % arg.name) @@ -255,11 +256,13 @@ def generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, option # {{{ integer arg finding from strides -def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options): +def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info): + options = kernel.options + gen("# {{{ find integer arguments from strides") gen("") - for arg in impl_arg_info: + for arg in implemented_data_info: if arg.stride_for_name_and_axis is not None: impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis @@ -297,156 +300,9 @@ def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, option # }}} -# {{{ value arg setup - -def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options): - import loopy as lp - from loopy.kernel.array import ArrayBase - - # {{{ arg counting bug handling - - # For example: - # https://github.com/pocl/pocl/issues/197 - # (but Apple CPU has a similar bug) - - work_around_arg_count_bug = False - warn_about_arg_count_bug = False - - devices = cl_kernel.context.devices - - try: - from pyopencl.characterize import has_struct_arg_count_bug - - except ImportError: - count_bug_per_dev = [False]*len(devices) - - else: - count_bug_per_dev = [ - has_struct_arg_count_bug(dev) - for dev in devices] - - if any(count_bug_per_dev): - if all(count_bug_per_dev): - work_around_arg_count_bug = True - else: - warn_about_arg_count_bug = True - - # }}} - - cl_arg_idx = 0 - arg_idx_to_cl_arg_idx = {} - - fp_arg_count = 0 - - for arg_idx, arg in enumerate(impl_arg_info): - arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx - - if arg.arg_class is not lp.ValueArg: - assert issubclass(arg.arg_class, ArrayBase) - - # assume each of those generates exactly one... - cl_arg_idx += 1 - - continue - - gen("# {{{ process %s" % arg.name) - gen("") - - if not options.skip_arg_checks: - gen(""" - if {name} is None: - raise RuntimeError("input argument '{name}' must " - "be supplied") - """.format(name=arg.name)) - - if sys.version_info < (2, 7) and arg.dtype.is_integral(): - gen("# cast to long to avoid trouble with struct packing") - gen("%s = long(%s)" % (arg.name, arg.name)) - gen("") - - if arg.dtype.is_composite(): - gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name)) - cl_arg_idx += 1 - - elif arg.dtype.is_complex(): - assert isinstance(arg.dtype, NumpyType) - - dtype = arg.dtype - - if warn_about_arg_count_bug: - from warnings import warn - warn("{knl_name}: arguments include complex numbers, and " - "some (but not all) of the target devices mishandle " - "struct kernel arguments (hence the workaround is " - "disabled".format( - knl_name=kernel.name)) - - if dtype.numpy_dtype == np.complex64: - arg_char = "f" - elif dtype.numpy_dtype == np.complex128: - arg_char = "d" - else: - raise TypeError("unexpected complex type: %s" % dtype) - - if (work_around_arg_count_bug - and dtype.numpy_dtype == np.complex128 - and fp_arg_count + 2 <= 8): - gen( - "buf = _lpy_pack('{arg_char}', {arg_var}.real)" - .format(arg_char=arg_char, arg_var=arg.name)) - gen( - "cl_kernel.set_arg({cl_arg_idx}, buf)" - .format(cl_arg_idx=cl_arg_idx)) - cl_arg_idx += 1 - - gen( - "buf = _lpy_pack('{arg_char}', {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg.name)) - gen( - "cl_kernel.set_arg({cl_arg_idx}, buf)" - .format(cl_arg_idx=cl_arg_idx)) - cl_arg_idx += 1 - else: - gen( - "buf = _lpy_pack('{arg_char}{arg_char}', " - "{arg_var}.real, {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg.name)) - gen( - "cl_kernel.set_arg({cl_arg_idx}, buf)" - .format(cl_arg_idx=cl_arg_idx)) - cl_arg_idx += 1 - - fp_arg_count += 2 - - elif isinstance(arg.dtype, NumpyType): - if arg.dtype.dtype.kind == "f": - fp_arg_count += 1 - - gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))" - % (cl_arg_idx, arg.dtype.dtype.char, arg.name)) - - cl_arg_idx += 1 - - else: - raise LoopyError("do not know how to pass argument of type '%s'" - % arg.dtype) - - gen("") - - gen("# }}}") - gen("") - - assert cl_arg_idx == cl_kernel.num_args - - return arg_idx_to_cl_arg_idx - -# }}} - - -# {{{ array arg setup +# {{{ arg setup -def generate_array_arg_setup(gen, kernel, impl_arg_info, options, - arg_idx_to_cl_arg_idx): +def generate_arg_setup(gen, kernel, implemented_data_info, options): import loopy as lp from loopy.kernel.array import ArrayBase @@ -461,13 +317,16 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options, gen("_lpy_encountered_dev = False") gen("") + args = [] + strify = StringifyMapper() - for arg_idx, arg in enumerate(impl_arg_info): + for arg_idx, arg in enumerate(implemented_data_info): is_written = arg.base_name in kernel.get_written_variables() kernel_arg = kernel.impl_arg_to_arg.get(arg.name) if not issubclass(arg.arg_class, ArrayBase): + args.append(arg.name) continue gen("# {{{ process %s" % arg.name) @@ -667,12 +526,11 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options, gen("del _lpy_made_by_loopy") gen("") - cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx] - if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: - gen("cl_kernel.set_arg(%d, %s.base_data)" % (cl_arg_idx, arg.name)) + args.append("%s.base_data" % arg.name) else: - gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name)) + args.append("%s" % arg.name) + gen("") gen("# }}}") @@ -681,19 +539,23 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options, gen("# }}}") gen("") + return args + # }}} -def generate_invoker(kernel, cl_kernel, impl_arg_info, options): +def generate_invoker(kernel, implemented_data_info, host_code): + options = kernel.options + system_args = [ - "cl_kernel", "queue", "allocator=None", "wait_for=None", + "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None", # ignored if options.no_numpy "out_host=None" ] gen = PythonFunctionGenerator( "invoke_%s_loopy_kernel" % kernel.name, - system_args + ["%s=None" % iai.name for iai in impl_arg_info]) + system_args + ["%s=None" % iai.name for iai in implemented_data_info]) gen.add_to_preamble("from __future__ import division") gen.add_to_preamble("") @@ -701,7 +563,8 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options): gen.add_to_preamble("import pyopencl.array as _lpy_cl_array") gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") gen.add_to_preamble("import numpy as _lpy_np") - gen.add_to_preamble("from struct import pack as _lpy_pack") + gen.add_to_preamble("") + gen.add_to_preamble(host_code) gen.add_to_preamble("") gen("if allocator is None:") @@ -709,37 +572,21 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options): gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)") gen("") - generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options) - generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options) - generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options) + generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info) + generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info) + generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info) - arg_idx_to_cl_arg_idx = \ - generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options) - generate_array_arg_setup(gen, kernel, impl_arg_info, options, - arg_idx_to_cl_arg_idx) + args = generate_arg_setup(gen, kernel, implemented_data_info, options) # {{{ generate invocation - from loopy.symbolic import StringifyMapper - - strify = StringifyMapper() - gsize_expr, lsize_expr = kernel.get_grid_sizes_as_exprs() - - if not gsize_expr: - gsize_expr = (1,) - if not lsize_expr: - lsize_expr = (1,) - - def strify_tuple(t): - return "(%s,)" % ( - ", ".join("int(%s)" % strify(t_i) for t_i in t)) - - gen("_lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, cl_kernel, " - "%(gsize)s, %(lsize)s, wait_for=wait_for, g_times_l=True)" - % dict( - gsize=strify_tuple(gsize_expr), - lsize=strify_tuple(lsize_expr))) - gen("") + gen("_lpy_evt = {kernel_name}({args})" + .format( + kernel_name=kernel.name, + args=", ".join( + ["_lpy_cl_kernels", "queue"] + + args + + ["wait_for=wait_for"]))) # }}} @@ -754,7 +601,7 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options): gen("if out_host:") with Indentation(gen): gen("pass") # if no outputs (?!) - for arg_idx, arg in enumerate(impl_arg_info): + for arg_idx, arg in enumerate(implemented_data_info): is_written = arg.base_name in kernel.get_written_variables() if is_written: gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) @@ -764,11 +611,11 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options): if options.return_dict: gen("return _lpy_evt, {%s}" % ", ".join("\"%s\": %s" % (arg.name, arg.name) - for arg in impl_arg_info + for arg in implemented_data_info if arg.base_name in kernel.get_written_variables())) else: out_args = [arg - for arg in impl_arg_info + for arg in implemented_data_info if arg.base_name in kernel.get_written_variables()] if out_args: gen("return _lpy_evt, (%s,)" @@ -801,6 +648,10 @@ class _CLKernelInfo(Record): pass +class _CLKernels(object): + pass + + class CompiledKernel: """An object connecting a kernel to a :class:`pyopencl.Context` for execution. @@ -819,7 +670,9 @@ class CompiledKernel: """ self.context = context - self.kernel = kernel + + from loopy.target.pyopencl import PyOpenCLTarget + self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) self.packing_controller = SeparateArrayPackingController(kernel) @@ -865,11 +718,13 @@ class CompiledKernel: def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) - from loopy.codegen import generate_code - code, impl_arg_info = generate_code(kernel) + from loopy.codegen import generate_code_v2 + codegen_result = generate_code_v2(kernel) + + dev_code = codegen_result.device_code() if self.kernel.options.write_cl: - output = code + output = dev_code if self.kernel.options.highlight_cl: output = get_highlighted_cl_code(output) @@ -881,23 +736,31 @@ class CompiledKernel: if self.kernel.options.edit_cl: from pytools import invoke_editor - code = invoke_editor(code, "code.cl") + dev_code = invoke_editor(dev_code, "code.cl") import pyopencl as cl logger.info("%s: opencl compilation start" % self.kernel.name) - cl_program = cl.Program(self.context, code) - cl_kernel = getattr( - cl_program.build(options=kernel.options.cl_build_options), - kernel.name) + + cl_program = ( + cl.Program(self.context, dev_code) + .build(options=kernel.options.cl_build_options)) + + cl_kernels = _CLKernels() + for dp in codegen_result.device_programs: + setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) + logger.info("%s: opencl compilation done" % self.kernel.name) return _CLKernelInfo( kernel=kernel, - cl_kernel=cl_kernel, - impl_arg_info=impl_arg_info, + cl_kernels=cl_kernels, + implemented_data_info=codegen_result.implemented_data_info, invoker=generate_invoker( - kernel, cl_kernel, impl_arg_info, self.kernel.options)) + kernel, + codegen_result.implemented_data_info, + codegen_result.host_code(), + )) # {{{ debugging aids @@ -907,9 +770,9 @@ class CompiledKernel: kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) - from loopy.codegen import generate_code - code, arg_info = generate_code(kernel) - return code + from loopy.codegen import generate_code_v2 + code = generate_code_v2(kernel) + return code.device_code() def get_highlighted_code(self, arg_to_dtype=None): return get_highlighted_cl_code( @@ -979,7 +842,7 @@ class CompiledKernel: frozenset(six.iteritems(arg_to_dtype))) return kernel_info.invoker( - kernel_info.cl_kernel, queue, allocator, wait_for, + kernel_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} diff --git a/loopy/expression.py b/loopy/expression.py index 6fd49661d617f65d2cdbc5773c3a59955da8c19c..991f4a93e30a76a09b527e4fd326cfafff5e7569 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -231,7 +231,10 @@ class TypeInferenceMapper(CombineMapper): if expr.name in self.kernel.all_inames(): return self.kernel.index_dtype - result = self.kernel.mangle_symbol(expr.name) + result = self.kernel.mangle_symbol( + self.kernel.target.get_device_ast_builder(), + expr.name) + if result is not None: result_dtype, _ = result return result_dtype diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 7baea324346b79e40d7ca601a99644727dd66e95..9b2c896ac2e8defc20bb54ea5c0682b68a363263 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -198,7 +198,7 @@ class LoopKernel(RecordWithoutPickling): # When kernels get intersected in slab decomposition, # their grid sizes shouldn't change. This provides # a way to forward sub-kernel grid size requests. - get_grid_sizes=None): + get_grid_sizes_for_insn_ids=None): if cache_manager is None: from loopy.kernel.tools import SetOperationCacheManager @@ -264,9 +264,9 @@ class LoopKernel(RecordWithoutPickling): if np.iinfo(index_dtype.numpy_dtype).min >= 0: raise TypeError("index_dtype must be signed") - if get_grid_sizes is not None: + if get_grid_sizes_for_insn_ids is not None: # overwrites method down below - self.get_grid_sizes = get_grid_sizes + self.get_grid_sizes_for_insn_ids = get_grid_sizes_for_insn_ids if state not in [ kernel_state.INITIAL, @@ -307,8 +307,11 @@ class LoopKernel(RecordWithoutPickling): # {{{ function mangling - def mangle_function(self, identifier, arg_dtypes): - manglers = self.target.function_manglers() + self.function_manglers + def mangle_function(self, identifier, arg_dtypes, ast_builder=None): + if ast_builder is None: + ast_builder = self.target.get_device_ast_builder() + + manglers = ast_builder.function_manglers() + self.function_manglers for mangler in manglers: mangle_result = mangler(self, identifier, arg_dtypes) @@ -349,8 +352,8 @@ class LoopKernel(RecordWithoutPickling): # {{{ symbol mangling - def mangle_symbol(self, identifier): - manglers = self.target.symbol_manglers() + self.symbol_manglers + def mangle_symbol(self, ast_builder, identifier): + manglers = ast_builder.symbol_manglers() + self.symbol_manglers for mangler in manglers: result = mangler(self, identifier) @@ -895,9 +898,20 @@ class LoopKernel(RecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes(self, ignore_auto=False): + def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + all_inames_by_insns = set() - for insn in self.instructions: + for insn_id in insn_ids: + insn = self.id_to_insn[insn_id] + all_inames_by_insns |= self.insn_inames(insn) if not all_inames_by_insns <= self.all_inames(): @@ -973,8 +987,18 @@ class LoopKernel(RecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - def get_grid_sizes_as_exprs(self, ignore_auto=False): - grid_size, group_size = self.get_grid_sizes(ignore_auto) + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :mod:`pymbolic` expressions + """ + + grid_size, group_size = self.get_grid_sizes_for_insn_ids( + insn_ids, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -982,6 +1006,27 @@ class LoopKernel(RecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) + def get_grid_size_upper_bounds(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + return self.get_grid_sizes_for_insn_ids( + frozenset(insn.id for insn in self.instructions), + ignore_auto=ignore_auto) + + def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :mod:`pymbolic` expressions + """ + + return self.get_grid_sizes_for_insn_ids_as_exprs( + frozenset(insn.id for insn in self.instructions), + ignore_auto=ignore_auto) + # }}} # {{{ local memory diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index b82965914a2fd6eabb460e3e68f31131862c6330..988636ebffb318a34dbb61dfdfb1f5d4c523bc77 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -991,7 +991,6 @@ class ArrayBase(Record): # generate stride arguments, yielded later to keep array first for stride_user_axis, stride_impl_axis, stride_unvec_impl_axis \ in stride_arg_axes: - from cgen import Const, POD stride_name = full_name+"_stride%d" % stride_user_axis from pymbolic import var @@ -1004,20 +1003,16 @@ class ArrayBase(Record): target=target, name=stride_name, dtype=index_dtype, - cgen_declarator=Const(POD(index_dtype, stride_name)), arg_class=ValueArg, stride_for_name_and_axis=( - full_name, stride_impl_axis))) + full_name, stride_impl_axis), + is_written=False)) yield ImplementedDataInfo( target=target, name=full_name, base_name=self.name, - # implemented by various argument types - cgen_declarator=self.get_arg_decl( - target, name_suffix, shape, dtype, is_written), - arg_class=type(self), dtype=dtype, shape=shape, @@ -1025,18 +1020,18 @@ class ArrayBase(Record): unvec_shape=unvec_shape, unvec_strides=tuple(unvec_strides), allows_offset=bool(self.offset), - ) + + is_written=is_written) if self.offset: - from cgen import Const, POD offset_name = full_name+"_offset" yield ImplementedDataInfo( target=target, name=offset_name, dtype=index_dtype, - cgen_declarator=Const(POD(index_dtype, offset_name)), arg_class=ValueArg, - offset_for_name=full_name) + offset_for_name=full_name, + is_written=False) for sa in stride_args: yield sa diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index b477b5fee0b047d0fb74099174e2a8c6f08336d1..8de5919df44a6201a4cfb123fa1a34516132634e 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -202,8 +202,8 @@ class GlobalArg(ArrayBase, KernelArgument): min_target_axes = 0 max_target_axes = 1 - def get_arg_decl(self, target, name_suffix, shape, dtype, is_written): - return target.get_global_arg_decl(self.name + name_suffix, shape, + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): + return ast_builder.get_global_arg_decl(self.name + name_suffix, shape, dtype, is_written) @@ -211,8 +211,8 @@ class ConstantArg(ArrayBase, KernelArgument): min_target_axes = 0 max_target_axes = 1 - def get_arg_decl(self, target, name_suffix, shape, dtype, is_written): - return target.get_constant_arg_decl(self.name + name_suffix, shape, + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): + return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, dtype, is_written) @@ -224,8 +224,8 @@ class ImageArg(ArrayBase, KernelArgument): def dimensions(self): return len(self.dim_tags) - def get_arg_decl(self, target, name_suffix, shape, dtype, is_written): - return target.get_image_arg_decl(self.name + name_suffix, shape, + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): + return ast_builder.get_image_arg_decl(self.name + name_suffix, shape, self.num_target_axes(), dtype, is_written) @@ -258,8 +258,8 @@ class ValueArg(KernelArgument): key_builder.rec(key_hash, self.name) key_builder.rec(key_hash, self.dtype) - def get_arg_decl(self, target): - return target.get_value_arg_decl(self.name, (), + def get_arg_decl(self, ast_builder): + return ast_builder.get_value_arg_decl(self.name, (), self.dtype, False) # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 0a02979ab32ad45cc4b1f6519c2bd327fa150812..7862ceb1efbed4f70ebe0fa81d07de84b12fca60 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -393,7 +393,8 @@ class DomainChanger: # Changing the domain might look like it wants to change grid # sizes. Not true. - get_grid_sizes=self.kernel.get_grid_sizes) + # (Relevant for 'slab decomposition') + get_grid_sizes_for_insn_ids=self.kernel.get_grid_sizes_for_insn_ids) # }}} @@ -767,7 +768,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): # copies. if local_size is None: - _, local_size = kernel.get_grid_sizes_as_exprs( + _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( ignore_auto=True) # {{{ axis assignment helper function diff --git a/loopy/schedule.py b/loopy/schedule/__init__.py similarity index 97% rename from loopy/schedule.py rename to loopy/schedule/__init__.py index 7ac8778378869f7c9cf6cfbe8d4be145e9fad520..38fc7c5812b9105904a9cb156527bc074fdc6728 100644 --- a/loopy/schedule.py +++ b/loopy/schedule/__init__.py @@ -50,11 +50,19 @@ class ScheduleItem(Record): key_builder.rec(key_hash, getattr(self, field_name)) -class EnterLoop(ScheduleItem): +class BeginBlockItem(ScheduleItem): + pass + + +class EndBlockItem(ScheduleItem): + pass + + +class EnterLoop(BeginBlockItem): hash_fields = __slots__ = ["iname"] -class LeaveLoop(ScheduleItem): +class LeaveLoop(EndBlockItem): hash_fields = __slots__ = ["iname"] @@ -62,11 +70,11 @@ class RunInstruction(ScheduleItem): hash_fields = __slots__ = ["insn_id"] -class CallKernel(ScheduleItem): +class CallKernel(BeginBlockItem): hash_fields = __slots__ = ["kernel_name"] -class ReturnFromKernel(ScheduleItem): +class ReturnFromKernel(EndBlockItem): hash_fields = __slots__ = ["kernel_name"] @@ -87,15 +95,15 @@ class Barrier(ScheduleItem): # {{{ schedule utilities -def gather_schedule_subloop(schedule, start_idx): - assert isinstance(schedule[start_idx], EnterLoop) +def gather_schedule_block(schedule, start_idx): + assert isinstance(schedule[start_idx], BeginBlockItem) level = 0 i = start_idx while i < len(schedule): - if isinstance(schedule[i], EnterLoop): + if isinstance(schedule[i], BeginBlockItem): level += 1 - if isinstance(schedule[i], LeaveLoop): + elif isinstance(schedule[i], EndBlockItem): level -= 1 if level == 0: @@ -107,16 +115,17 @@ def gather_schedule_subloop(schedule, start_idx): def generate_sub_sched_items(schedule, start_idx): - if not isinstance(schedule[start_idx], EnterLoop): + if not isinstance(schedule[start_idx], BeginBlockItem): yield start_idx, schedule[start_idx] level = 0 i = start_idx while i < len(schedule): sched_item = schedule[i] - if isinstance(sched_item, EnterLoop): + if isinstance(sched_item, BeginBlockItem): level += 1 - elif isinstance(sched_item, LeaveLoop): + + elif isinstance(sched_item, EndBlockItem): level -= 1 else: @@ -130,6 +139,14 @@ def generate_sub_sched_items(schedule, start_idx): assert False +def get_insn_ids_for_block_at(schedule, start_idx): + return frozenset( + sub_sched_item.insn_id + for sub_sched_item in generate_sub_sched_items( + schedule, start_idx) + if isinstance(sub_sched_item, RunInstruction)) + + def find_active_inames_at(kernel, sched_index): active_inames = [] @@ -146,8 +163,8 @@ def find_active_inames_at(kernel, sched_index): def has_barrier_within(kernel, sched_index): sched_item = kernel.schedule[sched_index] - if isinstance(sched_item, EnterLoop): - loop_contents, _ = gather_schedule_subloop( + if isinstance(sched_item, BeginBlockItem): + loop_contents, _ = gather_schedule_block( kernel.schedule, sched_index) from pytools import any return any(isinstance(subsched_item, Barrier) @@ -161,8 +178,8 @@ def has_barrier_within(kernel, sched_index): def find_used_inames_within(kernel, sched_index): sched_item = kernel.schedule[sched_index] - if isinstance(sched_item, EnterLoop): - loop_contents, _ = gather_schedule_subloop( + if isinstance(sched_item, BeginBlockItem): + loop_contents, _ = gather_schedule_block( kernel.schedule, sched_index) run_insns = [subsched_item for subsched_item in loop_contents @@ -1259,7 +1276,7 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): if isinstance(sched_item, EnterLoop): # {{{ recurse for nested loop - subloop, new_i = gather_schedule_subloop(schedule, i) + subloop, new_i = gather_schedule_block(schedule, i) i = new_i # Run barrier insertion for inner loop @@ -1457,7 +1474,7 @@ def generate_loop_schedules(kernel, debug_args={}): for gen_sched in gen: debug.stop() - gsize, lsize = kernel.get_grid_sizes_as_exprs() + gsize, lsize = kernel.get_grid_size_upper_bounds() if gsize or lsize: logger.info("%s: barrier insertion: global" % kernel.name) @@ -1481,9 +1498,15 @@ def generate_loop_schedules(kernel, debug_args={}): logger.info("%s: barrier insertion: done" % kernel.name) - yield kernel.copy( + new_kernel = kernel.copy( schedule=gen_sched, state=kernel_state.SCHEDULED) + + from loopy.codegen.device_mapping import \ + map_schedule_onto_host_or_device + new_kernel = map_schedule_onto_host_or_device(new_kernel) + yield new_kernel + debug.start() schedule_count += 1 diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 219d66d49a47882bf76bd8c89f7298c8597dfa0e..cade5e5e060b26a8c6d44965e61500baa31e5c08 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -33,8 +33,7 @@ from pytools import memoize, memoize_method, Record import pytools.lex from pymbolic.primitives import ( - Leaf, AlgebraicLeaf, Expression, Variable, - CommonSubexpression) + Leaf, AlgebraicLeaf, Expression, Variable, CommonSubexpression) from pymbolic.mapper import ( CombineMapper as CombineMapperBase, @@ -1094,7 +1093,7 @@ def ineq_constraint_from_expr(space, expr): return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr)) -def constraint_to_expr(cns, except_name=None): +def constraint_to_expr(cns): # Looks like this is ok after all--get_aff() performs some magic. # Not entirely sure though... FIXME # @@ -1102,7 +1101,13 @@ def constraint_to_expr(cns, except_name=None): #if ls.dim(dim_type.div): #raise RuntimeError("constraint has an existentially quantified variable") - return aff_to_expr(cns.get_aff(), except_name=except_name) + expr = aff_to_expr(cns.get_aff()) + + from pymbolic.primitives import Comparison + if cns.is_equality(): + return Comparison(expr, "==", 0) + else: + return Comparison(expr, ">=", 0) # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index cad451c2ed041875f9dd9535d73ce20e8ac9e057..d9f89ca3571b31cb724e5a598918525cf2b4b665 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -29,6 +29,8 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: TargetBase +.. autoclass:: ASTBuilderBase + .. autoclass:: CTarget .. autoclass:: CudaTarget .. autoclass:: OpenCLTarget @@ -39,7 +41,7 @@ __doc__ = """ class TargetBase(object): - """Base class for all targets, i.e. different types of code that + """Base class for all targets, i.e. different combinations of code that loopy can generate. Objects of this type must be picklable. @@ -69,19 +71,6 @@ class TargetBase(object): # }}} - # {{{ library - - def function_manglers(self): - return [] - - def symbol_manglers(self): - return [] - - def preamble_generators(self): - return [] - - # }}} - # {{{ top-level codegen def preprocess(self, kernel): @@ -90,11 +79,30 @@ class TargetBase(object): def pre_codegen_check(self, kernel): pass - def generate_code(self, kernel, codegen_state, impl_arg_info): - pass - # }}} + host_program_name_suffix = "_outer" + device_program_name_suffix = "" + + def split_kernel_at_global_barriers(self): + """ + :returns: a :class:`bool` indicating whether the kernel should + be split when a global barrier is encountered. + """ + raise NotImplementedError() + + def get_host_ast_builder(self): + """ + :returns: a class implementing :class:`ASTBuilderBase` for the host code + """ + raise NotImplementedError() + + def get_device_ast_builder(self): + """ + :returns: a class implementing :class:`ASTBuilderBase` for the host code + """ + raise NotImplementedError() + # {{{ types def get_dtype_registry(self): @@ -112,8 +120,47 @@ class TargetBase(object): # }}} + +class ASTBuilderBase(object): + """An interface for generating (host or device) ASTs. + """ + + def __init__(self, target): + self.target = target + + # {{{ library + + def function_manglers(self): + return [] + + def symbol_manglers(self): + return [] + + def preamble_generators(self): + return [] + + # }}} + # {{{ code generation guts + def get_function_definition(self, codegen_state, codegen_result, + schedule_index, function_decl, function_body): + raise NotImplementedError + + def get_function_declaration(self, codegen_state, codegen_result, + schedule_index): + raise NotImplementedError + + def get_temporary_decls(self, codegen_state): + raise NotImplementedError + + def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + raise NotImplementedError + + @property + def ast_block_class(self): + raise NotImplementedError() + def get_expression_to_code_mapper(self, codegen_state): raise NotImplementedError() @@ -123,7 +170,6 @@ class TargetBase(object): def emit_barrier(self, kind, comment): """ :arg kind: ``"local"`` or ``"global"`` - :return: a :class:`loopy.codegen.GeneratedInstruction`. """ raise NotImplementedError() @@ -133,13 +179,101 @@ class TargetBase(object): def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError() - def generate_multiple_assignment(self, codegen_state, insn): + def emit_assignment(self, codegen_state, lhs, rhs): raise NotImplementedError() - def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var, + def emit_multiple_assignment(self, codegen_state, insn): + raise NotImplementedError() + + def emit_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic update in target %s" % type(self).__name__) + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, + static_lbound, static_ubound, inner): + raise NotImplementedError() + + def emit_if(self, condition_str, ast): + raise NotImplementedError() + + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + raise NotImplementedError() + + def emit_blank_line(self): + raise NotImplementedError() + + def emit_comment(self, s): + raise NotImplementedError() + # }}} + +# {{{ dummy host ast builder + +class _DummyExpressionToCodeMapper(object): + def rec(self, expr, prec, type_context=None, needed_dtype=None): + return "" + + __call__ = rec + + +class _DummyASTBlock(object): + def __init__(self, arg): + self.contents = [] + + def __str__(self): + return "" + + +class DummyHostASTBuilder(ASTBuilderBase): + def get_function_definition(self, codegen_state, codegen_result, + schedule_index, function_decl, function_body): + return function_body + + def get_function_declaration(self, codegen_state, codegen_result, + schedule_index): + return None + + def get_temporary_decls(self, codegen_state): + return [] + + def get_expression_to_code_mapper(self, codegen_state): + return _DummyExpressionToCodeMapper() + + def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + return None + + @property + def ast_block_class(self): + return _DummyASTBlock + + def emit_assignment(self, codegen_state, lhs, rhs): + return None + + def emit_multiple_assignment(self, codegen_state, insn): + return None + + def emit_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype): + return None + + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, + static_lbound, static_ubound, inner): + return None + + def emit_if(self, condition_str, ast): + return None + + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + return None + + def emit_blank_line(self): + return None + + def emit_comment(self, s): + return None + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 5e35075623c346f908468ae3855f78883da8d652..f87b999e32890b28043fd3c6eeafc6488eebafc2 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,7 +27,7 @@ THE SOFTWARE. import six import numpy as np # noqa -from loopy.target import TargetBase +from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError from pytools import memoize_method @@ -91,6 +91,43 @@ def _preamble_generator(preamble_info): # }}} +# {{{ cgen overrides + +from cgen import Declarator + + +class POD(Declarator): + """A simple declarator: The type is given as a :class:`numpy.dtype` + and the *name* is given as a string. + """ + + def __init__(self, ast_builder, dtype, name): + from loopy.types import LoopyType + assert isinstance(dtype, LoopyType) + + self.ast_builder = ast_builder + self.ctype = ast_builder.target.dtype_to_typename(dtype) + self.dtype = dtype + self.name = name + + def get_decl_pair(self): + return [self.ctype], self.name + + def struct_maker_code(self, name): + return name + + def struct_format(self): + return self.dtype.char + + def alignment_requirement(self): + return self.ast_builder.target.alignment_requirement(self) + + def default_value(self): + return 0 + +# }}} + + class CTarget(TargetBase): """A target for plain "C", without any parallel extensions. """ @@ -102,6 +139,15 @@ class CTarget(TargetBase): self.fortran_abi = fortran_abi super(CTarget, self).__init__() + def split_kernel_at_global_barriers(self): + return False + + def get_host_ast_builder(self): + return DummyHostASTBuilder(self) + + def get_device_ast_builder(self): + return CASTBuilder(self) + # {{{ types @memoize_method @@ -129,11 +175,13 @@ class CTarget(TargetBase): # }}} + +class CASTBuilder(ASTBuilderBase): # {{{ library def preamble_generators(self): return ( - super(CTarget, self).preamble_generators() + [ + super(CASTBuilder, self).preamble_generators() + [ _preamble_generator, ]) @@ -141,35 +189,47 @@ class CTarget(TargetBase): # {{{ code generation - def generate_code(self, kernel, codegen_state, impl_arg_info): - from cgen import FunctionBody, FunctionDeclaration, Value, Module + def get_function_definition(self, codegen_state, codegen_result, + schedule_index, + function_decl, function_body): + from cgen import FunctionBody + return FunctionBody(function_decl, function_body) - body, implemented_domains = kernel.target.generate_body( - kernel, codegen_state) + def idi_to_cgen_declarator(self, kernel, idi): + if (idi.offset_for_name is not None + or idi.stride_for_name_and_axis is not None): + assert not idi.is_written + from cgen import Const + return Const(POD(self, idi.dtype, idi.name)) + else: + name = idi.base_name or idi.name + arg = kernel.arg_dict[name] + from loopy.kernel.data import ArrayBase + if isinstance(arg, ArrayBase): + return arg.get_arg_decl( + self, + idi.name[len(name):], idi.shape, idi.dtype, + idi.is_written) + else: + return arg.get_arg_decl(self) - name = kernel.name - if self.fortran_abi: + def get_function_declaration(self, codegen_state, codegen_result, + schedule_index): + from cgen import FunctionDeclaration, Value + + name = codegen_result.current_program(codegen_state).name + if self.target.fortran_abi: name += "_" - mod = Module([ - FunctionBody( - kernel.target.wrap_function_declaration( - kernel, - FunctionDeclaration( + return FunctionDeclaration( Value("void", name), - [iai.cgen_declarator for iai in impl_arg_info])), - body) - ]) - - return str(mod), implemented_domains + [self.idi_to_cgen_declarator(codegen_state.kernel, idi) + for idi in codegen_state.implemented_data_info]) - def wrap_function_declaration(self, kernel, fdecl): - return fdecl - - def generate_body(self, kernel, codegen_state): - from cgen import Block - body = Block() + def get_temporary_decls(self, codegen_state): + kernel = codegen_state.kernel + base_storage_decls = [] temp_decls = [] # {{{ declare temporaries @@ -179,7 +239,6 @@ class CTarget(TargetBase): base_storage_to_align_bytes = {} from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute, Value - from loopy.codegen import POD # uses the correct complex type class ConstRestrictPointer(Pointer): def get_decl_pair(self): @@ -189,7 +248,7 @@ class CTarget(TargetBase): for tv in sorted( six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): - decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype) + decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype) if not tv.base_storage: for idi in decl_info: @@ -255,35 +314,27 @@ class CTarget(TargetBase): alignment = max(base_storage_to_align_bytes[bs_name]) bs_var_decl = AlignedAttribute(alignment, bs_var_decl) - body.append(bs_var_decl) - - body.extend(temp_decls) + base_storage_decls.append(bs_var_decl) # }}} - from loopy.codegen.loop import set_up_hw_parallel_loops - gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state) - - from cgen import Line - body.append(Line()) - - if isinstance(gen_code.ast, Block): - body.extend(gen_code.ast.contents) - else: - body.append(gen_code.ast) + return base_storage_decls + temp_decls - return body, gen_code.implemented_domains + @property + def ast_block_class(self): + from cgen import Block + return Block # }}} # {{{ code generation guts def get_expression_to_code_mapper(self, codegen_state): - from loopy.target.c.codegen.expression import LoopyCCodeMapper - return LoopyCCodeMapper(codegen_state, fortran_abi=self.fortran_abi) + from loopy.target.c.codegen.expression import ExpressionToCMapper + return ExpressionToCMapper( + codegen_state, fortran_abi=self.target.fortran_abi) def get_temporary_decl(self, knl, temp_var, decl_info): - from loopy.codegen import POD # uses the correct complex type temp_var_decl = POD(self, decl_info.dtype, decl_info.name) if decl_info.shape: @@ -299,20 +350,18 @@ class CTarget(TargetBase): def get_value_arg_decl(self, name, shape, dtype, is_written): assert shape == () - from loopy.codegen import POD # uses the correct complex type result = POD(self, dtype, name) if not is_written: from cgen import Const result = Const(result) - if self.fortran_abi: + if self.target.fortran_abi: from cgen import Pointer result = Pointer(result) return result def get_global_arg_decl(self, name, shape, dtype, is_written): - from loopy.codegen import POD # uses the correct complex type from cgen import RestrictPointer, Const arg_decl = RestrictPointer(POD(self, dtype, name)) @@ -322,26 +371,11 @@ class CTarget(TargetBase): return arg_decl - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): - ecm = codegen_state.expression_to_code_mapper - - from loopy.symbolic import aff_to_expr - - from loopy.codegen import wrap_in - from pymbolic.mapper.stringifier import PREC_NONE - from cgen import For - - return wrap_in(For, - "%s %s = %s" - % (self.dtype_to_typename(iname_dtype), - iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), - "%s <= %s" % ( - iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")), - "++%s" % iname, - inner) + def emit_assignment(self, codegen_state, lhs, rhs): + from cgen import Assign + return Assign(lhs, rhs) - def generate_multiple_assignment(self, codegen_state, insn): + def emit_multiple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper from pymbolic.primitives import Variable @@ -371,7 +405,7 @@ class CTarget(TargetBase): from loopy.expression import dtype_to_type_context str_parameters = [ ecm(par, PREC_NONE, - dtype_to_type_context(self, tgt_dtype), + dtype_to_type_context(self.target, tgt_dtype), tgt_dtype) for par, par_dtype, tgt_dtype in zip( parameters, par_dtypes, mangle_result.arg_dtypes)] @@ -389,7 +423,7 @@ class CTarget(TargetBase): "side of instruction '%s'" % (i+1, insn.id)) str_parameters.append( "&(%s)" % ecm(a, PREC_NONE, - dtype_to_type_context(self, tgt_dtype), + dtype_to_type_context(self.target, tgt_dtype), tgt_dtype)) result = "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) @@ -406,6 +440,46 @@ class CTarget(TargetBase): lhs_code, result) + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, + static_lbound, static_ubound, inner): + ecm = codegen_state.expression_to_code_mapper + + from loopy.symbolic import aff_to_expr + + from pymbolic.mapper.stringifier import PREC_NONE + from cgen import For + + return For( + "%s %s = %s" + % (self.target.dtype_to_typename(iname_dtype), + iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + "%s <= %s" % ( + iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")), + "++%s" % iname, + inner) + + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + decl = POD(self, dtype, name) + + from cgen import Initializer, Const + + if is_const: + decl = Const(decl) + + return Initializer(decl, val_str) + + def emit_blank_line(self): + from cgen import Line + return Line() + + def emit_comment(self, s): + from cgen import Comment + return Comment(s) + + def emit_if(self, condition_str, ast): + from cgen import If + return If(condition_str, ast) + # }}} # vim: foldmethod=marker diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 11686dcdc3b691dbab6dbdc716729637e6dc96b7..58cf06e1250477b9ff8d41a81c9f34565b7d30f5 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -29,7 +29,8 @@ import numpy as np from pymbolic.mapper import RecursiveMapper from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT, - PREC_POWER) + PREC_POWER, + PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND) import islpy as isl from loopy.expression import dtype_to_type_context, TypeInferenceMapper @@ -41,7 +42,7 @@ from loopy.types import LoopyType # {{{ C code mapper -class LoopyCCodeMapper(RecursiveMapper): +class ExpressionToCMapper(RecursiveMapper): def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None): self.kernel = codegen_state.kernel self.codegen_state = codegen_state @@ -149,7 +150,7 @@ class LoopyCCodeMapper(RecursiveMapper): if isinstance(arg, ValueArg) and self.fortran_abi: prefix = "*" - result = self.kernel.mangle_symbol(expr.name) + result = self.kernel.mangle_symbol(self.codegen_state.ast_builder, expr.name) if result is not None: _, c_name = result return prefix + c_name @@ -223,7 +224,7 @@ class LoopyCCodeMapper(RecursiveMapper): enclosing_prec, PREC_CALL) if access_info.vector_index is not None: - return self.kernel.target.add_vector_access( + return self.codegen_state.ast_builder.add_vector_access( result, access_info.vector_index) else: return result @@ -427,7 +428,10 @@ class LoopyCCodeMapper(RecursiveMapper): str_parameters = None - mangle_result = self.kernel.mangle_function(identifier, par_dtypes) + mangle_result = self.kernel.mangle_function( + identifier, par_dtypes, + ast_builder=self.codegen_state.ast_builder) + if mangle_result is None: raise RuntimeError("function '%s' unknown--" "maybe you need to register a function mangler?" @@ -468,6 +472,21 @@ class LoopyCCodeMapper(RecursiveMapper): return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + def map_logical_not(self, expr, enclosing_prec, type_context): + return self.parenthesize_if_needed( + "!" + self.rec(expr.child, PREC_UNARY, type_context), + enclosing_prec, PREC_UNARY) + + def map_logical_and(self, expr, enclosing_prec, type_context): + return self.parenthesize_if_needed( + self.join_rec(" && ", expr.children, PREC_LOGICAL_AND, type_context), + enclosing_prec, PREC_LOGICAL_AND) + + def map_logical_or(self, expr, enclosing_prec, type_context): + return self.parenthesize_if_needed( + self.join_rec(" || ", expr.children, PREC_LOGICAL_OR, type_context), + enclosing_prec, PREC_LOGICAL_OR) + # {{{ deal with complex-valued variables def complex_type_name(self, dtype): diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 93b5da96fc2380d34beea350667adafc23028fac..9e9d652e1fa04066493ee14dd562b7bca1966766 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -28,8 +28,8 @@ import numpy as np from pytools import memoize_method -from loopy.target.c import CTarget -from loopy.target.c.codegen.expression import LoopyCCodeMapper +from loopy.target.c import CTarget, CASTBuilder +from loopy.target.c.codegen.expression import ExpressionToCMapper from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import temp_var_scope @@ -139,7 +139,7 @@ def cuda_function_mangler(kernel, name, arg_dtypes): # {{{ expression mapper -class LoopyCudaCCodeMapper(LoopyCCodeMapper): +class ExpressionToCudaCMapper(ExpressionToCMapper): _GRID_AXES = "xyz" @staticmethod @@ -178,15 +178,10 @@ class CudaTarget(CTarget): super(CudaTarget, self).__init__() - # {{{ library + def get_device_ast_builder(self): + return CUDACASTBuilder(self) - def function_manglers(self): - return ( - super(CudaTarget, self).function_manglers() + [ - cuda_function_mangler - ]) - - # }}} + # {{{ types @memoize_method def get_dtype_registry(self): @@ -213,17 +208,41 @@ class CudaTarget(CTarget): # }}} +# }}} + + +# {{{ ast builder + +class CUDACASTBuilder(CASTBuilder): + # {{{ library + + def function_manglers(self): + return ( + super(CudaTarget, self).function_manglers() + [ + cuda_function_mangler + ]) + + # }}} + # {{{ top-level codegen - def wrap_function_declaration(self, kernel, fdecl): + def get_function_declaration(self, codegen_state, codegen_result, + schedule_index): + fdecl = super(CUDACASTBuilder, self).get_function_declaration( + codegen_state, codegen_result, schedule_index) + from cgen.cuda import CudaGlobal, CudaLaunchBounds fdecl = CudaGlobal(fdecl) - if self.extern_c: + if self.target.extern_c: from cgen import Extern fdecl = Extern("C", fdecl) - _, local_grid_size = kernel.get_grid_sizes_as_exprs() + from loopy.schedule import get_insn_ids_for_block_at + _, local_grid_size = \ + codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insn_ids_for_block_at( + codegen_state.kernel.schedule, schedule_index)) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): @@ -259,7 +278,7 @@ class CudaTarget(CTarget): # {{{ code generation guts def get_expression_to_code_mapper(self, codegen_state): - return LoopyCudaCCodeMapper(codegen_state) + return ExpressionToCudaCMapper(codegen_state) _VEC_AXES = "xyzw" @@ -275,11 +294,8 @@ class CudaTarget(CTarget): if comment: comment = " /* %s */" % comment - from loopy.codegen import GeneratedInstruction from cgen import Statement - return GeneratedInstruction( - ast=Statement("__syncthreads()%s" % comment), - implemented_domain=None) + return Statement("__syncthreads()%s" % comment) elif kind == "global": raise LoopyError("CUDA does not have global barriers") else: @@ -296,7 +312,7 @@ class CudaTarget(CTarget): % scope) def get_global_arg_decl(self, name, shape, dtype, is_written): - from loopy.codegen import POD # uses the correct complex type + from loopy.target.c import POD # uses the correct complex type from cgen import Const from cgen.cuda import CudaRestrictPointer @@ -311,7 +327,7 @@ class CudaTarget(CTarget): raise NotImplementedError("not yet: texture arguments in CUDA") def get_constant_arg_decl(self, name, shape, dtype, is_written): - from loopy.codegen import POD # uses the correct complex type + from loopy.target.c import POD # uses the correct complex type from cgen import RestrictPointer, Const from cgen.cuda import CudaConstant @@ -324,6 +340,4 @@ class CudaTarget(CTarget): # }}} -# }}} - # vim: foldmethod=marker diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 3a3346198bb8448e571d922dfce31642f272a3e4..896ea9158223435e3bef933818fbf3bc51a424b4 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -26,8 +26,8 @@ THE SOFTWARE. import numpy as np # noqa -from loopy.target.c import CTarget -from loopy.target.c.codegen.expression import LoopyCCodeMapper +from loopy.target.c import CTarget, CASTBuilder +from loopy.target.c.codegen.expression import ExpressionToCMapper from loopy.diagnostic import LoopyError from pymbolic.mapper.stringifier import (PREC_SUM, PREC_CALL) @@ -36,7 +36,7 @@ from pytools import memoize_method # {{{ expression mapper -class LoopyISPCCodeMapper(LoopyCCodeMapper): +class ExprToISPCMapper(ExpressionToCMapper): def _get_index_ctype(self): if self.kernel.index_dtype.numpy_dtype == np.int32: return "int32" @@ -81,7 +81,7 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper): else: return expr.name else: - return super(LoopyISPCCodeMapper, self).map_variable( + return super(ExprToISPCMapper, self).map_variable( expr, enclosing_prec, type_context) def map_subscript(self, expr, enclosing_prec, type_context): @@ -90,7 +90,7 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper): ary = self.find_array(expr) if isinstance(ary, TemporaryVariable): - gsize, lsize = self.kernel.get_grid_sizes_as_exprs() + gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: lsize, = lsize from loopy.kernel.array import get_access_info @@ -113,7 +113,7 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper): else: return result - return super(LoopyISPCCodeMapper, self).map_subscript( + return super(ExprToISPCMapper, self).map_subscript( expr, enclosing_prec, type_context) # }}} @@ -157,6 +157,24 @@ class ISPCTarget(CTarget): super(ISPCTarget, self).__init__() + host_program_name_suffix = "" + device_program_name_suffix = "_inner" + + def pre_codegen_check(self, kernel): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + if len(lsize) > 1: + for i, ls_i in enumerate(lsize[1:]): + if ls_i != 1: + raise LoopyError("local axis %d (0-based) " + "has length > 1, which is unsupported " + "by ISPC" % ls_i) + + def get_host_ast_builder(self): + return ISPCASTBuilder(self) + + def get_device_ast_builder(self): + return ISPCASTBuilder(self) + # {{{ types @memoize_method @@ -169,23 +187,20 @@ class ISPCTarget(CTarget): # }}} - # {{{ top-level codegen - def generate_code(self, kernel, codegen_state, impl_arg_info): - from cgen import (FunctionBody, FunctionDeclaration, Value, Module, - Block, Line, Statement as S) - from cgen.ispc import ISPCExport, ISPCTask +class ISPCASTBuilder(CASTBuilder): + def _arg_names_and_decls(self, codegen_state): + implemented_data_info = codegen_state.implemented_data_info + arg_names = [iai.name for iai in implemented_data_info] - knl_body, implemented_domains = kernel.target.generate_body( - kernel, codegen_state) - - inner_name = "lp_ispc_inner_"+kernel.name - arg_decls = [iai.cgen_declarator for iai in impl_arg_info] - arg_names = [iai.name for iai in impl_arg_info] + arg_decls = [ + self.idi_to_cgen_declarator(codegen_state.kernel, idi) + for idi in implemented_data_info] # {{{ occa compatibility hackery - if self.occa_mode: + from cgen import Value + if self.target.occa_mode: from cgen import ArrayOf, Const from cgen.ispc import ISPCUniform @@ -199,97 +214,85 @@ class ISPCTarget(CTarget): # }}} - knl_fbody = FunctionBody( - ISPCTask( - FunctionDeclaration( - Value("void", inner_name), - arg_decls)), - knl_body) + return arg_names, arg_decls - # {{{ generate wrapper + # {{{ top-level codegen - wrapper_body = Block() + def get_function_declaration(self, codegen_state, codegen_result, + schedule_index): + name = codegen_result.current_program(codegen_state).name - gsize, lsize = kernel.get_grid_sizes_as_exprs() - if len(lsize) > 1: - for i, ls_i in enumerate(lsize[1:]): - if ls_i != 1: - raise LoopyError("local axis %d (0-based) " - "has length > 1, which is unsupported " - "by ISPC" % ls_i) + from cgen import (FunctionDeclaration, Value) + from cgen.ispc import ISPCExport, ISPCTask - from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE - ccm = self.get_expression_to_code_mapper(codegen_state) + arg_names, arg_decls = self._arg_names_and_decls(codegen_state) + + if codegen_state.is_generating_device_code: + return ISPCTask( + FunctionDeclaration( + Value("void", name), + arg_decls)) + else: + return ISPCExport( + FunctionDeclaration( + Value("void", name), + arg_decls)) + + # }}} + + def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + ecm = self.get_expression_to_code_mapper(codegen_state) + from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE + result = [] + from cgen import Statement as S, Block if lsize: - wrapper_body.append( + result.append( S("assert(programCount == %s)" - % ccm(lsize[0], PREC_COMPARISON))) + % ecm(lsize[0], PREC_COMPARISON))) if gsize: launch_spec = "[%s]" % ", ".join( - ccm(gs_i, PREC_NONE) + ecm(gs_i, PREC_NONE) for gs_i in gsize) else: launch_spec = "" - wrapper_body.append( - S("launch%s %s(%s)" - % ( - launch_spec, - inner_name, - ", ".join(arg_names) - )) - ) - - wrapper_fbody = FunctionBody( - ISPCExport( - FunctionDeclaration( - Value("void", kernel.name), - arg_decls)), - wrapper_body) + arg_names, arg_decls = self._arg_names_and_decls(codegen_state) - # }}} + result.append(S( + "launch%s %s(%s)" % ( + launch_spec, + name, + ", ".join(arg_names) + ))) - mod = Module([ - knl_fbody, - Line(), - wrapper_fbody, - ]) - - return str(mod), implemented_domains - - # }}} + return Block(result) # {{{ code generation guts def get_expression_to_code_mapper(self, codegen_state): - return LoopyISPCCodeMapper(codegen_state) + return ExprToISPCMapper(codegen_state) def add_vector_access(self, access_str, index): return "(%s)[%d]" % (access_str, index) def emit_barrier(self, kind, comment): - from loopy.codegen import GeneratedInstruction from cgen import Comment, Statement assert comment if kind == "local": - return GeneratedInstruction( - ast=Comment("local barrier: %s" % comment), - implemented_domain=None) + return Comment("local barrier: %s" % comment) elif kind == "global": - return GeneratedInstruction( - ast=Statement("sync; /* %s */" % comment), - implemented_domain=None) + return Statement("sync; /* %s */" % comment) else: raise LoopyError("unknown barrier kind") def get_temporary_decl(self, knl, temp_var, decl_info): - from loopy.codegen import POD # uses the correct complex type + from loopy.target.c import POD # uses the correct complex type temp_var_decl = POD(self, decl_info.dtype, decl_info.name) shape = decl_info.shape @@ -311,7 +314,7 @@ class ISPCTarget(CTarget): return ISPCUniform(decl) def get_global_arg_decl(self, name, shape, dtype, is_written): - from loopy.codegen import POD # uses the correct complex type + from loopy.target.c import POD # uses the correct complex type from cgen import Const from cgen.ispc import ISPCUniformPointer, ISPCUniform @@ -325,7 +328,7 @@ class ISPCTarget(CTarget): return arg_decl def get_value_arg_decl(self, name, shape, dtype, is_written): - result = super(ISPCTarget, self).get_value_arg_decl( + result = super(ISPCASTBuilder, self).get_value_arg_decl( name, shape, dtype, is_written) from cgen import Reference, Const @@ -334,7 +337,7 @@ class ISPCTarget(CTarget): if was_const: result = result.subdecl - if self.occa_mode: + if self.target.occa_mode: result = Reference(result) if was_const: @@ -349,13 +352,12 @@ class ISPCTarget(CTarget): from loopy.symbolic import aff_to_expr - from loopy.codegen import wrap_in from pymbolic.mapper.stringifier import PREC_NONE from cgen import For - return wrap_in(For, + return For( "uniform %s %s = %s" - % (self.dtype_to_typename(iname_dtype), + % (self.target.dtype_to_typename(iname_dtype), iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), "%s <= %s" % ( iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")), @@ -363,6 +365,7 @@ class ISPCTarget(CTarget): inner) # }}} + # TODO: Generate launch code # TODO: Vector types (element access: done) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index e839801fdfebaa40f6431db947200c32f880ee53..3f7b199ecb0b9e4c2653753ab29baff4f7ad4c46 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -26,8 +26,8 @@ THE SOFTWARE. import numpy as np -from loopy.target.c import CTarget -from loopy.target.c.codegen.expression import LoopyCCodeMapper +from loopy.target.c import CTarget, CASTBuilder +from loopy.target.c.codegen.expression import ExpressionToCMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType @@ -272,12 +272,21 @@ def opencl_preamble_generator(preamble_info): #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable """) + from loopy.tools import remove_common_indentation + kernel = preamble_info.kernel + yield ("00_declare_gid_lid", + remove_common_indentation(""" + #define lid(N) ((%(idx_ctype)s) get_local_id(N)) + #define gid(N) ((%(idx_ctype)s) get_group_id(N)) + """ % dict(idx_ctype=kernel.target.dtype_to_typename( + kernel.index_dtype)))) + # }}} # {{{ expression mapper -class LoopyOpenCLCCodeMapper(LoopyCCodeMapper): +class ExpressionToOpenCLCMapper(ExpressionToCMapper): def map_group_hw_index(self, expr, enclosing_prec, type_context): return "gid(%d)" % expr.axis @@ -307,29 +316,8 @@ class OpenCLTarget(CTarget): self.atomics_flavor = atomics_flavor - # {{{ library - - def function_manglers(self): - return ( - super(OpenCLTarget, self).function_manglers() + [ - opencl_function_mangler - ]) - - def symbol_manglers(self): - return ( - super(OpenCLTarget, self).symbol_manglers() + [ - opencl_symbol_mangler - ]) - - def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator - return ( - super(OpenCLTarget, self).preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator - ]) - - # }}} + def get_device_ast_builder(self): + return OpenCLCASTBuilder(self) @memoize_method def get_dtype_registry(self): @@ -359,13 +347,50 @@ class OpenCLTarget(CTarget): # }}} +# }}} + + +# {{{ ast builder + +class OpenCLCASTBuilder(CASTBuilder): + # {{{ library + + def function_manglers(self): + return ( + super(OpenCLCASTBuilder, self).function_manglers() + [ + opencl_function_mangler + ]) + + def symbol_manglers(self): + return ( + super(OpenCLCASTBuilder, self).symbol_manglers() + [ + opencl_symbol_mangler + ]) + + def preamble_generators(self): + from loopy.library.reduction import reduction_preamble_generator + return ( + super(OpenCLCASTBuilder, self).preamble_generators() + [ + opencl_preamble_generator, + reduction_preamble_generator + ]) + + # }}} + # {{{ top-level codegen - def wrap_function_declaration(self, kernel, fdecl): + def get_function_declaration(self, codegen_state, codegen_result, + schedule_index): + fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( + codegen_state, codegen_result, schedule_index) + from cgen.opencl import CLKernel, CLRequiredWorkGroupSize fdecl = CLKernel(fdecl) - _, local_sizes = kernel.get_grid_sizes_as_exprs() + from loopy.schedule import get_insn_ids_for_block_at + _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insn_ids_for_block_at( + codegen_state.kernel.schedule, schedule_index)) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): @@ -376,25 +401,9 @@ class OpenCLTarget(CTarget): return fdecl - def generate_code(self, kernel, codegen_state, impl_arg_info): - code, implemented_domains = ( - super(OpenCLTarget, self).generate_code( - kernel, codegen_state, impl_arg_info)) - - from loopy.tools import remove_common_indentation - code = ( - remove_common_indentation(""" - #define lid(N) ((%(idx_ctype)s) get_local_id(N)) - #define gid(N) ((%(idx_ctype)s) get_group_id(N)) - """ % dict(idx_ctype=self.dtype_to_typename(kernel.index_dtype))) - + "\n\n" - + code) - - return code, implemented_domains - def generate_body(self, kernel, codegen_state): body, implemented_domains = ( - super(OpenCLTarget, self).generate_body(kernel, codegen_state)) + super(OpenCLCASTBuilder, self).generate_body(kernel, codegen_state)) from loopy.kernel.data import ImageArg @@ -412,7 +421,7 @@ class OpenCLTarget(CTarget): # {{{ code generation guts def get_expression_to_code_mapper(self, codegen_state): - return LoopyOpenCLCCodeMapper(codegen_state) + return ExpressionToOpenCLCMapper(codegen_state) def add_vector_access(self, access_str, index): # The 'int' avoids an 'L' suffix for long ints. @@ -427,11 +436,8 @@ class OpenCLTarget(CTarget): if comment: comment = " /* %s */" % comment - from loopy.codegen import GeneratedInstruction from cgen import Statement - return GeneratedInstruction( - ast=Statement("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment), - implemented_domain=None) + return Statement("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment) elif kind == "global": raise LoopyError("OpenCL does not have global barriers") else: @@ -450,7 +456,7 @@ class OpenCLTarget(CTarget): def get_global_arg_decl(self, name, shape, dtype, is_written): from cgen.opencl import CLGlobal - return CLGlobal(super(OpenCLTarget, self).get_global_arg_decl( + return CLGlobal(super(OpenCLCASTBuilder, self).get_global_arg_decl( name, shape, dtype, is_written)) def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): @@ -486,7 +492,7 @@ class OpenCLTarget(CTarget): if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ np.int32, np.int64, np.float32, np.float64]: from cgen import Block, DoWhile, Assign - from loopy.codegen import POD + from loopy.target.c import POD old_val_var = codegen_state.var_name_generator("loopy_old_val") new_val_var = codegen_state.var_name_generator("loopy_new_val") diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 44b057f1a713a09bd2f493c8f3be67e790049e6e..806c5c26334f82ee8a57c40ffa7674e082600fab 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -2,6 +2,8 @@ from __future__ import division, absolute_import +import sys + __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -30,8 +32,11 @@ from six.moves import range import numpy as np from loopy.kernel.data import CallMangleInfo -from loopy.target.opencl import OpenCLTarget +from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder +from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType +from loopy.diagnostic import LoopyError +from warnings import warn import logging logger = logging.getLogger(__name__) @@ -139,7 +144,7 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_sizes_as_exprs() + glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -264,31 +269,48 @@ class PyOpenCLTarget(OpenCLTarget): warnings) and support for complex numbers. """ - def __init__(self, device=None): + def __init__(self, device=None, pyopencl_module_name="_lpy_cl"): # This ensures the dtype registry is populated. import pyopencl.tools # noqa super(PyOpenCLTarget, self).__init__() self.device = device + self.pyopencl_module_name = pyopencl_module_name - hash_fields = ["device"] comparison_fields = ["device"] - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler - return ( - super(PyOpenCLTarget, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + def update_persistent_hash(self, key_hash, key_builder): + super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder) + key_builder.rec(key_hash, getattr(self.device, "persistent_unique_id", None)) - def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator - return ([ - pyopencl_preamble_generator, - random123_preamble_generator, - ] + super(PyOpenCLTarget, self).preamble_generators()) + def __getstate__(self): + dev_id = None + if self.device is not None: + dev_id = self.device.persistent_unique_id + + return {"device_id": dev_id, "atomics_flavor": self.atomics_flavor} + + def __setstate__(self, state): + self.atomics_flavor = state["atomics_flavor"] + + dev_id = state["device_id"] + if dev_id is None: + self.device = None + else: + import pyopencl as cl + matches = [ + dev + for plat in cl.get_platforms() + for dev in plat.get_devices() + if dev.persistent_unique_id == dev_id] + + if matches: + self.device = matches[0] + else: + raise LoopyError( + "cannot unpickle device '%s': not found" + % dev_id) def preprocess(self, kernel): return kernel @@ -296,6 +318,14 @@ class PyOpenCLTarget(OpenCLTarget): def pre_codegen_check(self, kernel): check_sizes(kernel, self.device) + def get_host_ast_builder(self): + return PyOpenCLPythonASTBuilder(self) + + def get_device_ast_builder(self): + return PyOpenCLCASTBuilder(self) + + # {{{ types + def get_dtype_registry(self): try: from pyopencl.compyte.dtypes import TYPE_REGISTRY @@ -330,7 +360,294 @@ class PyOpenCLTarget(OpenCLTarget): return struct.calcsize(fmt) + # }}} + +# }}} + + +# {{{ host code: value arg setup + +def generate_value_arg_setup(kernel, devices, implemented_data_info): + options = kernel.options + + import loopy as lp + from loopy.kernel.array import ArrayBase + + # {{{ arg counting bug handling + + # For example: + # https://github.com/pocl/pocl/issues/197 + # (but Apple CPU has a similar bug) + + work_around_arg_count_bug = False + warn_about_arg_count_bug = False + + try: + from pyopencl.characterize import has_struct_arg_count_bug + + except ImportError: + count_bug_per_dev = [False]*len(devices) + + else: + count_bug_per_dev = [ + has_struct_arg_count_bug(dev) + if dev is not None else False + for dev in devices] + + if any(dev is None for dev in devices): + warn("{knl_name}: device not supplied to PyOpenCLTarget--" + "workarounds for broken OpenCL implementations " + "(such as those relating to complex numbers) " + "may not be enabled when needed" + .format(knl_name=kernel.name)) + + if any(count_bug_per_dev): + if all(count_bug_per_dev): + work_around_arg_count_bug = True + else: + warn_about_arg_count_bug = True + + # }}} + + cl_arg_idx = 0 + arg_idx_to_cl_arg_idx = {} + + fp_arg_count = 0 + + from genpy import ( + Comment, Line, If, Raise, Assign, Statement as S, Suite) + + result = [] + gen = result.append + + for arg_idx, idi in enumerate(implemented_data_info): + arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx + + if idi.arg_class is not lp.ValueArg: + assert issubclass(idi.arg_class, ArrayBase) + + # assume each of those generates exactly one... + cl_arg_idx += 1 + + continue + + gen(Comment("{{{ process %s" % idi.name)) + gen(Line()) + + if not options.skip_arg_checks: + gen(If("%s is None" % idi.name, + Raise('RuntimeError("input argument \'{name}\' ' + 'must be supplied")'.format(name=idi.name)))) + + if sys.version_info < (2, 7) and idi.dtype.is_integral(): + gen(Comment("cast to long to avoid trouble with struct packing")) + gen(Assign(idi.name, "long(%s)" % idi.name)) + gen(Line()) + + if idi.dtype.is_composite(): + gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name))) + cl_arg_idx += 1 + + elif idi.dtype.is_complex(): + assert isinstance(idi.dtype, NumpyType) + + dtype = idi.dtype + + if warn_about_arg_count_bug: + warn("{knl_name}: arguments include complex numbers, and " + "some (but not all) of the target devices mishandle " + "struct kernel arguments (hence the workaround is " + "disabled".format( + knl_name=kernel.name)) + + if dtype.numpy_dtype == np.complex64: + arg_char = "f" + elif dtype.numpy_dtype == np.complex128: + arg_char = "d" + else: + raise TypeError("unexpected complex type: %s" % dtype) + + if (work_around_arg_count_bug + and dtype.numpy_dtype == np.complex128 + and fp_arg_count + 2 <= 8): + gen(Assign( + "_lpy_buf", + "_lpy_pack('{arg_char}', {arg_var}.real)" + .format(arg_char=arg_char, arg_var=idi.name))) + gen(S( + "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)" + .format(cl_arg_idx=cl_arg_idx))) + cl_arg_idx += 1 + + gen(Assign( + "_lpy_buf", + "_lpy_pack('{arg_char}', {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=idi.name))) + gen(S( + "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)" + .format(cl_arg_idx=cl_arg_idx))) + cl_arg_idx += 1 + else: + gen(Assign( + "_lpy_buf", + "_lpy_pack('{arg_char}{arg_char}', " + "{arg_var}.real, {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=idi.name))) + gen(S( + "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)" + .format(cl_arg_idx=cl_arg_idx))) + cl_arg_idx += 1 + + fp_arg_count += 2 + + elif isinstance(idi.dtype, NumpyType): + if idi.dtype.dtype.kind == "f": + fp_arg_count += 1 + + gen(S( + "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))" + % (cl_arg_idx, idi.dtype.dtype.char, idi.name))) + + cl_arg_idx += 1 + + else: + raise LoopyError("do not know how to pass argument of type '%s'" + % idi.dtype) + + gen(Line()) + + gen(Comment("}}}")) + gen(Line()) + + return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx + +# }}} + + +def generate_array_arg_setup(kernel, implemented_data_info, arg_idx_to_cl_arg_idx): + from loopy.kernel.array import ArrayBase + from genpy import Statement as S, Suite + + result = [] + gen = result.append + + for arg_idx, arg in enumerate(implemented_data_info): + if not issubclass(arg.arg_class, ArrayBase): + continue + + cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx] + + gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, arg.name))) + + return Suite(result) + + +# {{{ host ast builder + +class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): + """A Python host AST builder for integration with PyOpenCL. + """ + + # {{{ code generation guts + + def get_function_definition(self, codegen_state, codegen_result, + schedule_index, function_decl, function_body): + args = ( + ["_lpy_cl_kernels", "queue"] + + [idi.name for idi in codegen_state.implemented_data_info] + + ["wait_for=None"]) + + from genpy import Function, Suite, ImportAs, Return, FromImport, Line + return Function( + codegen_result.current_program(codegen_state).name, + args, + Suite([ + FromImport("struct", ["pack as _lpy_pack"]), + ImportAs("pyopencl", "_lpy_cl"), + Line(), + function_body, + Return("_lpy_evt"), + ])) + + def get_function_declaration(self, codegen_state, codegen_result, + schedule_index): + # no such thing in Python + return None + + def get_temporary_decls(self, codegen_state): + # FIXME: Create global temporaries + return [] + + def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + ecm = self.get_expression_to_code_mapper(codegen_state) + + if not gsize: + gsize = (1,) + if not lsize: + lsize = (1,) + + value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \ + generate_value_arg_setup(codegen_state.kernel, [self.target.device], + codegen_state.implemented_data_info) + arry_arg_code = generate_array_arg_setup( + codegen_state.kernel, + codegen_state.implemented_data_info, + arg_idx_to_cl_arg_idx) + + from genpy import Suite, Assign, Assert, Line, Comment + from pymbolic.mapper.stringifier import PREC_NONE + + # TODO: Generate finer-grained dependency structure + return Suite([ + Comment("{{{ enqueue %s" % name), + Line(), + Assign("_lpy_knl", "_lpy_cl_kernels."+name), + Assert("_lpy_knl.num_args == %d" % cl_arg_count), + Line(), + value_arg_code, + arry_arg_code, + Assign("_lpy_evt", "%(pyopencl_module_name)s.enqueue_nd_range_kernel(" + "queue, _lpy_knl, " + "%(gsize)s, %(lsize)s, wait_for=wait_for, g_times_l=True)" + % dict( + pyopencl_module_name=self.target.pyopencl_module_name, + gsize=ecm(gsize, prec=PREC_NONE, type_context="i"), + lsize=ecm(lsize, prec=PREC_NONE, type_context="i"))), + Assign("wait_for", "[_lpy_evt]"), + Line(), + Comment("}}}"), + Line(), + ]) + + # }}} + # }}} +# {{{ device ast builder + +class PyOpenCLCASTBuilder(OpenCLCASTBuilder): + """A C device AST builder for integration with PyOpenCL. + """ + + # {{{ library + + def function_manglers(self): + from loopy.library.random123 import random123_function_mangler + return ( + super(PyOpenCLCASTBuilder, self).function_manglers() + [ + pyopencl_function_mangler, + random123_function_mangler + ]) + + def preamble_generators(self): + from loopy.library.random123 import random123_preamble_generator + return ([ + pyopencl_preamble_generator, + random123_preamble_generator, + ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) + + # }}} + +# }}} # vim: foldmethod=marker diff --git a/loopy/target/python.py b/loopy/target/python.py new file mode 100644 index 0000000000000000000000000000000000000000..83e8df12459dc0b79d0789341dfc1213c008e084 --- /dev/null +++ b/loopy/target/python.py @@ -0,0 +1,128 @@ +"""Python host AST builder for integration with PyOpenCL.""" + +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from pymbolic.mapper.stringifier import StringifyMapper +from loopy.expression import TypeInferenceMapper +from loopy.kernel.data import ValueArg +from loopy.diagnostic import LoopyError +from loopy.target import ASTBuilderBase + + +# {{{ expression to code + +class ExpressionToPythonMapper(StringifyMapper): + def __init__(self, codegen_state, type_inf_mapper=None): + self.kernel = codegen_state.kernel + self.codegen_state = codegen_state + + if type_inf_mapper is None: + type_inf_mapper = TypeInferenceMapper(self.kernel) + self.type_inf_mapper = type_inf_mapper + + def rec(self, expr, prec, type_context=None, needed_dtype=None): + return super(ExpressionToPythonMapper, self).rec(expr, prec) + + __call__ = rec + + def map_constant(self, expr, enclosing_prec): + return repr(expr) + + def map_variable(self, expr, enclosing_prec): + if expr.name in self.kernel.all_inames(): + return super(ExpressionToPythonMapper, self).map_variable( + expr, enclosing_prec) + + var_descr = self.kernel.get_var_descriptor(expr.name) + if isinstance(var_descr, ValueArg): + return super(ExpressionToPythonMapper, self).map_variable( + expr, enclosing_prec) + + raise LoopyError("may not refer to %s '%s' in host code" + % (type(var_descr).__name__, expr.name)) + + def map_subscript(self, expr, enclosing_prec): + raise LoopyError("may not subscript '%s' in host code" + % expr.aggregate.name) + +# }}} + + +# {{{ ast builder + +class PythonASTBuilderBase(ASTBuilderBase): + """A Python host AST builder for integration with PyOpenCL. + """ + + # {{{ code generation guts + + def get_expression_to_code_mapper(self, codegen_state): + return ExpressionToPythonMapper(codegen_state) + + @property + def ast_block_class(self): + from genpy import Suite + return Suite + + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, + static_lbound, static_ubound, inner): + ecm = codegen_state.expression_to_code_mapper + + from loopy.symbolic import aff_to_expr + + from pymbolic.mapper.stringifier import PREC_NONE + from genpy import For + + return For( + (iname,), + "range(%s, %s + 1)" + % ( + ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), + ecm(aff_to_expr(static_ubound), PREC_NONE, "i"), + ), + inner) + + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + from genpy import Assign + return Assign(name, val_str) + + def emit_blank_line(self): + from genpy import Line + return Line() + + def emit_comment(self, s): + from genpy import Comment + return Comment(s) + + def emit_if(self, condition_str, ast): + from genpy import If + return If(condition_str, ast) + + # }}} + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/types.py b/loopy/types.py index d80650eb6b203c70b2abf60481ebb9e38ca99b9f..b897d9f700b198e73d95a09c7d459ed2d7f877b1 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -116,10 +116,6 @@ class NumpyType(LoopyType): self.dtype = self.target.get_or_register_dtype([name], NumpyType(dtype)) def with_target(self, target): - if (self.target is not None - and target is not self.target): - raise RuntimeError("target already set to different value") - return type(self)(self.dtype, target) def assert_has_target(self): diff --git a/loopy/version.py b/loopy/version.py index 7716feea32079bb122557223b2a1e970ff630ecc..b1b7927babd7a33aed91eb390c4ca2268ae0f204 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v26-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v27-islpy%s" % _islpy_version diff --git a/requirements.txt b/requirements.txt index 5dbc4b050f4dd688e038b994281e08e73ab3133b..c4dbe7a6de33a8a0b801bb94fcc8b506b790091c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ -git+git://github.com/inducer/pytools -git+git://github.com/inducer/islpy -git+git://github.com/inducer/cgen -git+git://github.com/pyopencl/pyopencl -git+git://github.com/inducer/pymbolic +git+https://github.com/inducer/pytools.git +git+https://github.com/inducer/islpy.git +git+https://github.com/inducer/cgen.git +git+https://github.com/pyopencl/pyopencl.git +git+https://github.com/inducer/pymbolic.git +git+https://github.com/inducer/genpy.git hg+https://bitbucket.org/inducer/f2py diff --git a/setup.py b/setup.py index 30d6dfb636c4d5cea181da4a6b7e6df514adca9d..bd24612e93cef3b9e25748b4d157d0df4b7209d5 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ setup(name="loo.py", install_requires=[ "pytools>=2016.1", "pymbolic>=2016.2", + "genpy>=2016.1", "cgen>=2016.1", "islpy>=2016.1.2", "six>=1.8.0", diff --git a/test/test_loopy.py b/test/test_loopy.py index a61b6563ff254fdf6eaf6279158a97d20045a663..588cc9a2015a983d86803848bb5e4d84ec6ae678 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1740,18 +1740,24 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): outer_tag=outer_tag) knl = lp.set_loop_priority(knl, "i_outer") - a = cl.clrandom.rand(queue, 20, np.float32) + a = cl.array.empty(queue, 20, np.float32) + a.fill(17) a_ref = a.copy() a_knl = a.copy() - knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, write_cl=True) + print("TEST-----------------------------------------") knl(queue, a=a_knl) + print("REF-----------------------------------------") ref_knl(queue, a=a_ref) + print("DONE-----------------------------------------") - queue.finish() - + print("REF", a_ref) + print("KNL", a_knl) assert (a_ref == a_knl).get().all() + print("_________________________________") + def test_multiple_writes_to_local_temporary(): # Loopy would previously only handle barrier insertion correctly if exactly @@ -2391,10 +2397,12 @@ def test_ispc_target(occa_mode=False): knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) - print( - lp.generate_code( + codegen_result = lp.generate_code_v2( lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl)))[0]) + lp.preprocess_kernel(knl))) + + print(codegen_result.device_code()) + print(codegen_result.host_code()) def test_cuda_target(): @@ -2556,6 +2564,81 @@ def test_random123(ctx_factory, tp): assert (0 <= out).all() +def test_kernel_splitting(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + """ + c[i] = a[i + 1] + out[i] = c[i] + """) + + knl = lp.add_and_infer_dtypes(knl, + {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + # schedule + from loopy.preprocess import preprocess_kernel + knl = preprocess_kernel(knl) + + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(knl) + + # map schedule onto host or device + print(knl) + + cgr = lp.generate_code_v2(knl) + + assert len(cgr.device_programs) == 2 + + print(cgr.device_code()) + print(cgr.host_code()) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) + + +def test_kernel_splitting_with_loop(ctx_factory): + #ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i,k]: 0<=i<n and 0<=k<3 }", + """ + c[k,i] = a[k, i + 1] + out[k,i] = c[k,i] + """) + + knl = lp.add_and_infer_dtypes(knl, + {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + # schedule + from loopy.preprocess import preprocess_kernel + knl = preprocess_kernel(knl) + + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(knl) + + # map schedule onto host or device + print(knl) + + cgr = lp.generate_code_v2(knl) + + assert len(cgr.device_programs) == 2 + + print(cgr.device_code()) + print(cgr.host_code()) + + # Doesn't yet work--not passing k + #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])