diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 5e3d11396f7a2e5720607700eca1a481c17005e2..71a1c4b3f5972b383af07ec1b07ad5ca3cba020b 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -218,7 +218,7 @@ class CodeGenerationState(object): # {{{ copy helpers - def copy(self, kernel=None, + def copy(self, kernel=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), var_subst_map=None, vectorization_info=None, is_generating_device_code=None, @@ -228,6 +228,9 @@ class CodeGenerationState(object): if kernel is None: kernel = self.kernel + if implemented_data_info is None: + implemented_data_info = self.implemented_data_info + if vectorization_info is False: vectorization_info = None @@ -245,7 +248,7 @@ class CodeGenerationState(object): return CodeGenerationState( kernel=kernel, - implemented_data_info=self.implemented_data_info, + implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( implemented_predicates or self.implemented_predicates), @@ -407,7 +410,7 @@ def generate_code_v2(kernel): # {{{ examine arg list - from loopy.kernel.data import ValueArg, temp_var_scope + from loopy.kernel.data import ValueArg from loopy.kernel.array import ArrayBase implemented_data_info = [] @@ -432,13 +435,6 @@ def generate_code_v2(kernel): else: raise ValueError("argument type not understood: '%s'" % type(arg)) - for tv in six.itervalues(kernel.temporary_variables): - if tv.scope == temp_var_scope.GLOBAL: - implemented_data_info.extend( - tv.decl_info( - kernel.target, - index_dtype=kernel.index_dtype)) - allow_complex = False for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): if var.dtype.involves_complex(): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e12fa7e3e5a9567673c00e958f181886eb1fd68c..4438958e53d176535b13262a56f0b30a95d9faf7 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -57,6 +57,39 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index): return frozenset(result) +def synthesize_idis_for_extra_args(kernel, schedule_index): + """ + :returns: A list of :class:`loopy.codegen.ImplementedDataInfo` + """ + sched_item = kernel.schedule[schedule_index] + + from loopy.codegen import ImplementedDataInfo + from loopy.kernel.data import InameArg, temp_var_scope + + assert isinstance(sched_item, CallKernel) + + idis = [] + + for arg in sched_item.extra_args: + temporary = kernel.temporary_variables[arg] + assert temporary.scope == temp_var_scope.GLOBAL + idis.extend( + temporary.decl_info( + kernel.target, + index_dtype=kernel.index_dtype)) + + for iname in sched_item.extra_inames: + idis.append( + ImplementedDataInfo( + target=kernel.target, + name=iname, + dtype=kernel.index_dtype, + arg_class=InameArg, + is_written=False)) + + return idis + + def generate_code_for_sched_index(codegen_state, sched_index): kernel = codegen_state.kernel sched_item = kernel.schedule[sched_index] @@ -68,17 +101,18 @@ def generate_code_for_sched_index(codegen_state, sched_index): _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) assert past_end_i <= codegen_state.schedule_index_end - from loopy.codegen.tools import synthesize_idis_for_extra_args extra_args = synthesize_idis_for_extra_args(kernel, sched_index) new_codegen_state = codegen_state.copy( is_generating_device_code=True, gen_program_name=sched_item.kernel_name, - schedule_index_end=past_end_i-1) + schedule_index_end=past_end_i-1, + implemented_data_info=(codegen_state.implemented_data_info + + extra_args)) from loopy.codegen.result import generate_host_or_device_program codegen_result = generate_host_or_device_program( - new_codegen_state, sched_index + 1, extra_args) + new_codegen_state, sched_index + 1) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index)) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index f4d61eb2163bd7ea82cbfc4abdf730de24fdea1a..ad92fcefa7db5e20cbc44d0cf643070b8f4c1832 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -216,6 +216,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): .with_new_ast(codegen_state, ast) .copy( implemented_domains=implemented_domains, + implemented_data_info=codegen_state.implemented_data_info, **kwargs)) @@ -238,9 +239,7 @@ def wrap_in_if(codegen_state, condition_exprs, inner): # {{{ program generation top-level -def generate_host_or_device_program(codegen_state, - schedule_index, - extra_args=[]): +def generate_host_or_device_program(codegen_state, schedule_index): ast_builder = codegen_state.ast_builder temp_decls = ast_builder.get_temporary_decls(codegen_state) @@ -266,8 +265,7 @@ def generate_host_or_device_program(codegen_state, cur_prog = codegen_result.current_program(codegen_state) body_ast = cur_prog.ast fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index, - extra_args) + codegen_state, codegen_result, schedule_index) fdef_ast = ast_builder.get_function_definition( codegen_state, codegen_result, diff --git a/loopy/codegen/tools.py b/loopy/codegen/tools.py deleted file mode 100644 index 0d779a49400a6e9ccda40c7511723f417135ab06..0000000000000000000000000000000000000000 --- a/loopy/codegen/tools.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import division, absolute_import, print_function - -__copyright__ = "Copyright (C) 2016 Matt Wala" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - -from pytools import memoize - -@memoize -def synthesize_idis_for_extra_args(kernel, schedule_index): - """ - :returns: A list of :class:`loopy.codegen.ImplementedDataInfo` - """ - sched_item = kernel.schedule[schedule_index] - - from loopy.schedule import CallKernel - from loopy.codegen import ImplementedDataInfo - from loopy.kernel.data import InameArg, temp_var_scope - from loopy.types import NumpyType - import numpy as np - - assert isinstance(sched_item, CallKernel) - - idis = [] - for iname in sched_item.extra_args: - idis.append(ImplementedDataInfo( - target=kernel.target, - name=iname, - dtype=NumpyType(np.int32, kernel.target), - arg_class=InameArg, - is_written=False)) - - return idis diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 2851aff698ba4f6b76b16dfb38dba7b79f0f5b0e..89c9ffcdaaceb78778231101411e1f017f7c4f85 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -71,7 +71,7 @@ class RunInstruction(ScheduleItem): class CallKernel(BeginBlockItem): - hash_fields = __slots__ = ["kernel_name", "extra_args"] + hash_fields = __slots__ = ["kernel_name", "extra_args", "extra_inames"] class ReturnFromKernel(EndBlockItem): @@ -390,9 +390,10 @@ def dump_schedule(kernel, schedule): lines.append(indent + "ENDLOOP %s" % sched_item.iname) elif isinstance(sched_item, CallKernel): lines.append(indent + - "CALL KERNEL %s(extra_args=%s)" % ( + "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % ( sched_item.kernel_name, - sched_item.extra_args)) + sched_item.extra_args, + sched_item.extra_inames)) indent += " " elif isinstance(sched_item, ReturnFromKernel): indent = indent[:-4] diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index ef3b30d7920d29b939e804a13dfa9b973a52e78b..bf6f67e218aa2ddea183c769f3cf70348b9aec67 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -31,16 +31,15 @@ def map_schedule_onto_host_or_device(kernel): from loopy.schedule import CallKernel, ReturnFromKernel new_schedule = ( [CallKernel(kernel_name=kernel.name, - extra_args=[])] + + extra_args=[], + extra_inames=[])] + list(kernel.schedule) + [ReturnFromKernel(kernel_name=kernel.name)]) - return kernel.copy(schedule=new_schedule) + kernel = kernel.copy(schedule=new_schedule) + else: + kernel = map_schedule_onto_host_or_device_impl(kernel) - # Split the schedule onto host or device. - kernel = map_schedule_onto_host_or_device_impl(kernel) - # Compute which temporaries and inames go into which kernel. - kernel = restore_and_save_temporaries(kernel) - return kernel + return restore_and_save_temporaries(kernel) def get_block_boundaries(schedule): @@ -359,6 +358,8 @@ def determine_temporaries_to_promote(kernel, temporaries, name_gen): # }}} +# {{{ Domain augmentation + def augment_domain_for_temporary_promotion( kernel, domain, promoted_temporary, mode, name_gen): """ @@ -417,6 +418,8 @@ def augment_domain_for_temporary_promotion( domain = domain_list.get_basic_set(0) return domain, hw_inames, dim_inames, iname_to_tag +# }}} + def restore_and_save_temporaries(kernel): """ @@ -436,16 +439,11 @@ def restore_and_save_temporaries(kernel): inter_kernel_temporaries |= filter_out_subscripts(live_in[idx]) call_count += 1 - if call_count == 1: - # A single call / return corresponds to a kernel which has not been - # split. - return kernel - name_gen = kernel.get_var_name_generator() new_temporaries = determine_temporaries_to_promote( kernel, inter_kernel_temporaries, name_gen) - # {{{ Insert loads and spills of new temporaries. + # {{{ Insert loads and spills of new temporaries new_schedule = [] new_instructions = [] @@ -498,6 +496,12 @@ def restore_and_save_temporaries(kernel): tvals_to_load = ((subkernel_uses - subkernel_globals) | tvals_to_spill) & live_in[start_idx] + # Add new arguments. + sched_item = sched_item.copy( + extra_args=sorted(subkernel_globals + | set(new_temporaries[tv].name + for tv in tvals_to_load | tvals_to_spill))) + # }}} # {{{ Add all the loads and spills. @@ -532,7 +536,7 @@ def restore_and_save_temporaries(kernel): from loopy.kernel.tools import DomainChanger tval_hw_inames = new_temporaries[tval].hw_inames dchg = DomainChanger(kernel, - frozenset(sched_item.extra_args + tval_hw_inames)) + frozenset(sched_item.extra_inames + tval_hw_inames)) domain = dchg.domain domain, hw_inames, dim_inames, itt = \ @@ -627,7 +631,7 @@ def map_schedule_onto_host_or_device_impl(kernel): # {{{ Inner mapper function - dummy_call = CallKernel(kernel_name="", extra_args=[]) + dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[]) dummy_return = ReturnFromKernel(kernel_name="") def inner_mapper(start_idx, end_idx, new_schedule): @@ -721,7 +725,7 @@ def map_schedule_onto_host_or_device_impl(kernel): last_kernel_name = kernel_name_gen() new_schedule[idx] = sched_item.copy( kernel_name=last_kernel_name, - extra_args=list(inames)) + extra_inames=list(inames)) elif isinstance(sched_item, ReturnFromKernel): new_schedule[idx] = sched_item.copy( kernel_name=last_kernel_name) @@ -731,4 +735,5 @@ def map_schedule_onto_host_or_device_impl(kernel): inames.pop() new_kernel = kernel.copy(schedule=new_schedule) + return new_kernel diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 81488d24f8aab45aaf4bd75c2e46520b3b5687ce..3ec3a50b11f72a2975ac4366d495326bfcb69b37 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -236,7 +236,7 @@ class DummyHostASTBuilder(ASTBuilderBase): return function_body def get_function_declaration(self, codegen_state, codegen_result, - schedule_index, extra_args): + schedule_index): return None def get_temporary_decls(self, codegen_state): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 69906bcbce6fe84b0cf3a5a3e234a4e4e6c5ead4..15156f2a13ca1cb60d8b64471852e253b59a8c2c 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -224,7 +224,7 @@ class CASTBuilder(ASTBuilderBase): return var_descr.get_arg_decl(self) def get_function_declaration(self, codegen_state, codegen_result, - schedule_index, extra_args): + schedule_index): from cgen import FunctionDeclaration, Value name = codegen_result.current_program(codegen_state).name @@ -234,8 +234,7 @@ class CASTBuilder(ASTBuilderBase): return FunctionDeclaration( Value("void", name), [self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in - codegen_state.implemented_data_info + extra_args]) + for idi in codegen_state.implemented_data_info]) def get_temporary_decls(self, codegen_state): from loopy.kernel.data import temp_var_scope diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index d63160e1c45c857fdd068c29f74d5e7562823388..9e9d652e1fa04066493ee14dd562b7bca1966766 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -227,9 +227,9 @@ class CUDACASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index, extra_args): + schedule_index): fdecl = super(CUDACASTBuilder, self).get_function_declaration( - codegen_state, codegen_result, schedule_index, extra_args) + codegen_state, codegen_result, schedule_index) from cgen.cuda import CudaGlobal, CudaLaunchBounds fdecl = CudaGlobal(fdecl) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 5c680d3de2dd44b204dca43208f694c01c479341..896ea9158223435e3bef933818fbf3bc51a424b4 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -219,8 +219,7 @@ class ISPCASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index, extra_args): - assert extra_args == [], "extra_args not yet implemented" + schedule_index): name = codegen_result.current_program(codegen_state).name from cgen import (FunctionDeclaration, Value) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 37e63577bbb26d256105ff7da2fd242604e82679..c8f3b6b9e9dbd827e11b35af7189cd0c49b5bba3 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -384,9 +384,9 @@ class OpenCLCASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index, extra_args): + schedule_index): fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( - codegen_state, codegen_result, schedule_index, extra_args) + codegen_state, codegen_result, schedule_index) from cgen.opencl import CLKernel, CLRequiredWorkGroupSize fdecl = CLKernel(fdecl) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index b35de12727198761aa527a7f597d0eacd5e7d29c..c36d78c4d78098217f2005184c2a5bdde2f59918 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -578,17 +578,8 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): if not issubclass(idi.arg_class, TemporaryVariable)] + ["wait_for=None", "allocator=None"]) - ecm = self.get_expression_to_code_mapper(codegen_state) - - def alloc_nbytes(idi): - return idi.dtype.numpy_dtype.itemsize * ( - sum(astrd*(alen-1) - for alen, astrd in zip(idi.unvec_shape, idi.unvec_strides)) - + 1) - - from genpy import (Function, Suite, Import, ImportAs, Return, FromImport, - If, Assign, Line, Statement as S) - from pymbolic.mapper.stringifier import PREC_NONE + from genpy import (For, Function, Suite, Import, ImportAs, Return, + FromImport, If, Assign, Line, Statement as S) return Function( codegen_result.current_program(codegen_state).name, args, @@ -602,38 +593,54 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): "allocator", "_lpy_cl_tools.DeferredAllocator(queue.context)")), Line(), - ] + [ - - # allocate global temporaries - Assign(idi.name, "allocator(%s)" - % ecm(alloc_nbytes(idi), PREC_NONE, "i")) - for idi in codegen_result.implemented_data_info - if issubclass(idi.arg_class, TemporaryVariable) - ] + [ Line(), function_body, Line(), ] + [ - - # free global temporaries - S("%s.release()" % idi.name) - for idi in codegen_result.implemented_data_info - if issubclass(idi.arg_class, TemporaryVariable) - + For("_tv", "_global_temporaries", + # free global temporaries + S("_tv.release()")) ] + [ Line(), Return("_lpy_evt"), ])) def get_function_declaration(self, codegen_state, codegen_result, - schedule_index, extra_args): + schedule_index): # no such thing in Python return None def get_temporary_decls(self, codegen_state): - # FIXME: Create global temporaries - return [] + from genpy import Assign, Comment, Line + + def alloc_nbytes(tv): + return tv.dtype.numpy_dtype.itemsize + + from loopy.kernel.data import temp_var_scope + + global_temporaries = sorted( + (tv for tv in six.itervalues(codegen_state.kernel.temporary_variables) + if tv.scope == temp_var_scope.GLOBAL), + key=lambda tv: tv.name) + + from pymbolic.mapper.stringifier import PREC_NONE + ecm = self.get_expression_to_code_mapper(codegen_state) + + if not global_temporaries: + return [Assign("_global_temporaries", "[]"), Line()] + + return [ + Comment("{{{ allocate global temporaries"), + Line()] + [ + Assign(tv.name, "allocator(%s)" % + ecm(alloc_nbytes(tv), PREC_NONE, "i")) + for tv in global_temporaries] + [ + Assign("_global_temporaries", "[{tvs}]".format(tvs=", ".join( + tv.name for tv in global_temporaries)))] + [ + Line(), + Comment("}}}"), + Line()] def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): ecm = self.get_expression_to_code_mapper(codegen_state)