def diverge_callee_entrypoints(program):
    """
    Give every kernel that is used both as an entrypoint and as a callee a
    separately named callee copy.

    For each id that is in *program.entrypoints* and is also reported by
    :func:`loopy.program._get_callable_ids`, a fresh name is generated. The
    subkernel of every :class:`CallableKernel` in the callables table is
    passed through
    :func:`loopy.program.rename_resolved_functions_in_a_single_kernel` with
    that old-name -> new-name mapping (presumably redirecting resolved calls
    to the new names -- confirm against that helper). The original entry is
    kept under its original name, and a copy whose subkernel carries the new
    name is added to the table under the new name.

    :arg program: the program whose callables table is to be rewritten. It is
        not modified; a copy is returned.
    :returns: a copy of *program* with the updated callables table.
    :raises NotImplementedError: if the callables table contains a callable
        that is neither a :class:`CallableKernel` nor a
        :class:`ScalarCallable`.
    """
    # Imported once up front instead of on every loop iteration.
    from loopy.program import (
            _get_callable_ids,
            rename_resolved_functions_in_a_single_kernel)
    from pytools import UniqueNameGenerator

    callable_ids = _get_callable_ids(program.callables_table,
            program.entrypoints)

    # Entrypoints that also show up among the collected callable ids. Computed
    # once so that both passes below walk exactly the same set.
    clashing_ids = callable_ids & program.entrypoints

    # Seed the generator with every name already in the table so that the
    # generated callee names cannot collide with an existing callable.
    vng = UniqueNameGenerator(list(program.callables_table.keys()))

    renames = {}
    for clbl_id in clashing_ids:
        renames[clbl_id] = vng(based_on=clbl_id)

    # First pass: rebuild the table under the original names, applying the
    # renames inside every callable kernel.
    new_callables = {}
    for name, clbl in program.callables_table.items():
        if isinstance(clbl, CallableKernel):
            knl = rename_resolved_functions_in_a_single_kernel(
                    clbl.subkernel, renames)
            new_callables[name] = clbl.copy(subkernel=knl)
        elif isinstance(clbl, ScalarCallable):
            new_callables[name] = clbl
        else:
            raise NotImplementedError(type(clbl))

    # Second pass: append the renamed callee copies. This is deliberately kept
    # separate from the first pass so that the copies are inserted after all
    # original entries, leaving the relative order of the originals untouched.
    for clbl_id in clashing_ids:
        callee_name = renames[clbl_id]
        entrypoint_clbl = new_callables[clbl_id]
        new_callables[callee_name] = entrypoint_clbl.copy(
                subkernel=entrypoint_clbl.subkernel.copy(name=callee_name))

    return program.copy(callables_table=new_callables)
Then pass the callee versions by saying is_entrypoint=False cgr = generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.callables_table, program.target, True) + program.callables_table, program.target, func_id in + program.entrypoints) if func_id in program.entrypoints: host_programs.extend(cgr.host_programs) implemented_data_infos.append(cgr.implemented_data_info) else: - assert cgr.host_programs == [] + # FIXME: This assertion should be valid + # assert cgr.host_programs == [] assert len(cgr.device_programs) == 1 #FIXME: # if isinstance(callee_prog_ast, Collection): # for entry in callee_prog_ast.contents: # if isinstance(entry, FunctionBody): # callee_fdecls.append(entry.fdecl) - - device_programs.insert( - cgr.device_programs[0].ast.fdecl, 0) + callee_fdecls.append(cgr.device_programs[0].ast.fdecl) device_programs.extend(cgr.device_programs) device_preambles.extend(cgr.device_preambles) @@ -644,6 +694,11 @@ def generate_code_v2(program): device_preambles.extend(list(in_knl_callable.generate_preambles( program.target))) + # adding the callee fdecls to the device_programs + from cgen import Collection + device_programs = ([device_programs[0].copy( + ast=Collection(callee_fdecls+[device_programs[0].ast]))] + + device_programs[1:]) return CodeGenerationResult( host_programs=host_programs, device_programs=device_programs, diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index aa61ea3bc26c9d10d81af7b8b830657b0f3b09b6..475e6d1c87da6bd78c97e2b17a4f5f666fb199bd 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -315,8 +315,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .build(options=program[entrypoint].options.cl_build_options)) cl_kernels = _Kernels() - for dp in codegen_result.device_programs: - setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) + for dp in program.entrypoints: + #FIXME: This will fail for barriers, use a better option here. 
+ setattr(cl_kernels, dp, getattr(cl_program, dp)) return _KernelInfo( program=program,