From 64c8b0c74fbed28500a64e3d22bc4b11e1437895 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 May 2016 23:18:06 -0500 Subject: [PATCH] Make passing inames + private temporaries work. --- loopy/codegen/control.py | 7 +- loopy/codegen/result.py | 8 +- loopy/kernel/data.py | 4 + loopy/schedule/__init__.py | 7 +- loopy/schedule/device_mapping.py | 381 +++++++++++++++++-------------- loopy/target/c/__init__.py | 8 +- loopy/target/cuda.py | 4 +- loopy/target/ispc.py | 3 +- loopy/target/opencl.py | 4 +- loopy/target/pyopencl.py | 18 +- test/test_loopy.py | 23 +- 11 files changed, 259 insertions(+), 208 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 7b7309387..e12fa7e3e 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -68,6 +68,9 @@ def generate_code_for_sched_index(codegen_state, sched_index): _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) assert past_end_i <= codegen_state.schedule_index_end + from loopy.codegen.tools import synthesize_idis_for_extra_args + extra_args = synthesize_idis_for_extra_args(kernel, sched_index) + new_codegen_state = codegen_state.copy( is_generating_device_code=True, gen_program_name=sched_item.kernel_name, @@ -75,7 +78,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): from loopy.codegen.result import generate_host_or_device_program codegen_result = generate_host_or_device_program( - new_codegen_state, sched_index + 1) + new_codegen_state, sched_index + 1, extra_args) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index)) @@ -87,7 +90,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): codegen_state, sched_item.kernel_name, glob_grid, loc_grid, - ()), + extra_args), ]) elif isinstance(sched_item, EnterLoop): diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 0947d00cd..f4d61eb21 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -153,7 +153,6 @@ class CodeGenerationResult(Record): self.current_program(codegen_state).copy( ast=new_ast)) - # }}} @@ -239,7 +238,9 @@ def wrap_in_if(codegen_state, condition_exprs, inner): # {{{ program generation top-level -def generate_host_or_device_program(codegen_state, schedule_index): +def generate_host_or_device_program(codegen_state, + schedule_index, + extra_args=[]): ast_builder = codegen_state.ast_builder temp_decls = ast_builder.get_temporary_decls(codegen_state) @@ -265,7 +266,8 @@ def generate_host_or_device_program(codegen_state, schedule_index): cur_prog = codegen_result.current_program(codegen_state) body_ast = cur_prog.ast fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) + codegen_state, codegen_result, schedule_index, + extra_args) fdef_ast = ast_builder.get_function_definition( codegen_state, codegen_result, diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 9c399997d..26f4f64a6 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -262,6 +262,10 @@ class ValueArg(KernelArgument): return ast_builder.get_value_arg_decl(self.name, (), self.dtype, False) + +class InameArg(ValueArg): + pass + # }}} diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 132bf01e3..2851aff69 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -71,7 +71,7 @@ class RunInstruction(ScheduleItem): class CallKernel(BeginBlockItem): - hash_fields = __slots__ = ["kernel_name", "extra_inames", "extra_args"] + hash_fields = __slots__ = ["kernel_name", "extra_args"] class ReturnFromKernel(EndBlockItem): @@ -390,10 +390,9 @@ def dump_schedule(kernel, schedule): lines.append(indent + "ENDLOOP %s" % sched_item.iname) elif isinstance(sched_item, CallKernel): lines.append(indent + - "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % ( + "CALL KERNEL %s(extra_args=%s)" % ( sched_item.kernel_name, - sched_item.extra_args, - sched_item.extra_inames)) + sched_item.extra_args)) indent += " " elif isinstance(sched_item, ReturnFromKernel): indent = indent[:-4] diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 4cd5a8e8a..ef3b30d79 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -23,6 +23,7 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError +from pytools import Record, memoize_method def map_schedule_onto_host_or_device(kernel): @@ -30,7 +31,6 @@ def map_schedule_onto_host_or_device(kernel): from loopy.schedule import CallKernel, ReturnFromKernel new_schedule = ( [CallKernel(kernel_name=kernel.name, - extra_inames=[], extra_args=[])] + list(kernel.schedule) + [ReturnFromKernel(kernel_name=kernel.name)]) @@ -88,7 +88,6 @@ def get_common_hw_inames(kernel, insn_ids): # {{{ Use / def analysis - def filter_out_subscripts(exprs): """ Remove subscripts from expressions in `exprs`. @@ -267,73 +266,55 @@ def compute_live_temporaries(kernel, schedule): return live_in, live_out -def restore_and_save_temporaries(kernel): - """ - Add code that loads / spills the temporaries in the kernel which are - live across sub-kernel calls. - """ - # Compute live temporaries. - live_in, live_out = compute_live_temporaries(kernel, kernel.schedule) - - # Create kernel variables based on live temporaries. - inter_kernel_temporaries = set() - from loopy.schedule import CallKernel, ReturnFromKernel, RunInstruction - - call_count = 0 - for idx, sched_item in enumerate(kernel.schedule): - if isinstance(sched_item, CallKernel): - inter_kernel_temporaries |= filter_out_subscripts(live_in[idx]) - call_count += 1 +# {{{ Temporary promotion analysis - if call_count == 1: - # XXX - # Single kernel call - needs no saves / restores - return kernel +class PromotedTemporary(Record): + """ + .. attribute:: name - def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel) + The name of the new temporary. - # {{{ Determine which temporaries need passing around. + .. attribute:: orig_temporary - new_temporaries = {} - name_gen = kernel.get_var_name_generator() - - from loopy.kernel.data import LocalIndexTag, temp_var_scope - from pytools import Record + The original temporary variable object. - class PromotedTemporary(Record): - """ - .. attribute:: name + .. attribute:: hw_inames - The name of the new temporary. + The common list of hw axes that define the original object. - .. attribute:: orig_temporary + .. attribute:: shape_prefix - The original temporary variable object. + A list of expressions, to be added in front of the shape + of the promoted temporary value + """ - .. attribute:: hw_inames + @memoize_method + def as_variable(self): + temporary = self.orig_temporary + from loopy.kernel.data import TemporaryVariable, temp_var_scope + return TemporaryVariable( + name=self.name, + dtype=temporary.dtype, + scope=temp_var_scope.GLOBAL, + shape=self.new_shape) - The common list of hw axes that define the original object. + @property + def new_shape(self): + return self.shape_prefix + self.orig_temporary.shape - .. attribute:: shape_prefix - A list of expressions, to be added in front of the shape - of the promoted temporary value - """ +def determine_temporaries_to_promote(kernel, temporaries, name_gen): + """ + :returns: A :class:`dict` mapping temporary names from `temporaries` to + :class:`PromotedTemporary` objects + """ + new_temporaries = {} - def as_variable(self): - temporary = self.orig_temporary - from loopy.kernel.data import TemporaryVariable - return TemporaryVariable( - name=self.name, - dtype=temporary.dtype, - scope=temp_var_scope.GLOBAL, - shape=self.new_shape) + def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel) - @property - def new_shape(self): - return self.shape_prefix + self.orig_temporary.shape + from loopy.kernel.data import LocalIndexTag, temp_var_scope - for temporary in inter_kernel_temporaries: + for temporary in temporaries: temporary = kernel.temporary_variables[temporary] if temporary.scope == temp_var_scope.GLOBAL: # Nothing to be done for global temporaries (I hope) @@ -350,7 +331,7 @@ def restore_and_save_temporaries(kernel): key=lambda iname: str(kernel.iname_to_tag[iname])) shape_prefix = [] - idx = 0 + backing_hw_inames = [] for iname in hw_inames: tag = kernel.iname_to_tag[iname] @@ -373,7 +354,96 @@ def restore_and_save_temporaries(kernel): hw_inames=backing_hw_inames) new_temporaries[temporary.name] = backing_temporary - # }}} + return new_temporaries + +# }}} + + +def augment_domain_for_temporary_promotion( + kernel, domain, promoted_temporary, mode, name_gen): + """ + Add new axes to the domain corresponding to the dimensions of + `promoted_temporary`. + """ + import islpy as isl + + orig_temporary = promoted_temporary.orig_temporary + orig_dim = domain.dim(isl.dim_type.set) + dims_to_insert = len(orig_temporary.shape) + + iname_to_tag = {} + + # Add dimension-dependent inames. + dim_inames = [] + + domain = domain.add(isl.dim_type.set, dims_to_insert) + for t_idx in range(len(orig_temporary.shape)): + new_iname = name_gen("{name}_{mode}_dim_{dim}". + format(name=orig_temporary.name, + mode=mode, + dim=orig_dim + t_idx)) + domain = domain.set_dim_name( + isl.dim_type.set, orig_dim + t_idx, new_iname) + #from loopy.kernel.data import auto + #iname_to_tag[new_iname] = auto + dim_inames.append(new_iname) + + # Add size information. + aff = isl.affs_from_space(domain.space) + domain &= aff[0].le_set(aff[new_iname]) + size = orig_temporary.shape[t_idx] + from loopy.symbolic import aff_from_expr + domain &= aff[new_iname].le_set(aff_from_expr(domain.space, size)) + + hw_inames = [] + + # Add hardware inames duplicates. + for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + new_iname = name_gen("{name}_{mode}_hw_dim_{dim}". + format(name=orig_temporary.name, + mode=mode, + dim=t_idx)) + hw_inames.append(new_iname) + iname_to_tag[new_iname] = kernel.iname_to_tag[hw_iname] + + from loopy.isl_helpers import duplicate_axes + domain = duplicate_axes( + domain, promoted_temporary.hw_inames, hw_inames) + + # The operations on the domain above return a Set object, but the + # underlying domain should be expressible as a single BasicSet. + domain_list = domain.get_basic_set_list() + assert domain_list.n_basic_set() == 1 + domain = domain_list.get_basic_set(0) + return domain, hw_inames, dim_inames, iname_to_tag + + +def restore_and_save_temporaries(kernel): + """ + Add code that loads / spills the temporaries in the kernel which are + live across sub-kernel calls. + """ + # Compute live temporaries. + live_in, live_out = compute_live_temporaries(kernel, kernel.schedule) + + # Create kernel variables based on live temporaries. + inter_kernel_temporaries = set() + from loopy.schedule import CallKernel, ReturnFromKernel, RunInstruction + + call_count = 0 + for idx, sched_item in enumerate(kernel.schedule): + if isinstance(sched_item, CallKernel): + inter_kernel_temporaries |= filter_out_subscripts(live_in[idx]) + call_count += 1 + + if call_count == 1: + # A single call / return corresponds to a kernel which has not been + # split. + return kernel + + name_gen = kernel.get_var_name_generator() + new_temporaries = determine_temporaries_to_promote( + kernel, inter_kernel_temporaries, name_gen) # {{{ Insert loads and spills of new temporaries. @@ -397,6 +467,8 @@ def restore_and_save_temporaries(kernel): subkernel_epilog = [] subkernel_schedule = [] + # {{{ Determine what to load / spill + start_idx = idx idx += 1 @@ -414,6 +486,7 @@ def restore_and_save_temporaries(kernel): kernel, get_use_set(insn))) idx += 1 + from loopy.kernel.data import temp_var_scope # Filter out temporaries that are global. subkernel_globals = set( tval for tval in subkernel_defs | subkernel_uses @@ -425,64 +498,10 @@ def restore_and_save_temporaries(kernel): tvals_to_load = ((subkernel_uses - subkernel_globals) | tvals_to_spill) & live_in[start_idx] - # Add arguments. - new_schedule.append( - sched_item.copy(extra_args=sorted( - set(new_temporaries[tval].name - for tval in tvals_to_spill | tvals_to_load) - | subkernel_globals))) - - import islpy as isl + # }}} # {{{ Add all the loads and spills. - def augment_domain(tval, domain, mode_str): - temporary = new_temporaries[tval] - orig_size = domain.dim(isl.dim_type.set) - dims_to_insert = len(temporary.orig_temporary.shape) - # Add dimension-dependent inames. - dim_inames = [] - - domain = domain.add(isl.dim_type.set, dims_to_insert) - for t_idx in range(len(temporary.orig_temporary.shape)): - new_iname = name_gen("{name}.{mode}.dim_{dim}". - format(name=temporary.orig_temporary.name, - mode=mode_str, - dim=orig_size + t_idx)) - domain = domain.set_dim_name( - isl.dim_type.set, orig_size + t_idx, new_iname) - from loopy.kernel.data import auto - new_iname_to_tag[new_iname] = auto - dim_inames.append(new_iname) - # Add size information. - aff = isl.affs_from_space(domain.space) - domain &= aff[0].le_set(aff[new_iname]) - size = temporary.orig_temporary.shape[t_idx] - from loopy.symbolic import aff_from_expr - domain &= aff[new_iname].le_set(aff_from_expr(domain.space, size)) - - hw_inames = [] - - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(temporary.hw_inames): - new_iname = name_gen("{name}.{mode}.hw_dim_{dim}". - format(name=temporary.orig_temporary.name, - mode=mode_str, - dim=t_idx)) - hw_inames.append(new_iname) - new_iname_to_tag[new_iname] = kernel.iname_to_tag[hw_iname] - - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, temporary.hw_inames, hw_inames) - - # The operations on the domain above return a Set object, but the - # underlying domain should be expressible as a single BasicSet. - domain_list = domain.get_basic_set_list() - assert domain_list.n_basic_set() == 1 - domain = domain_list.get_basic_set(0) - return domain, hw_inames, dim_inames - def subscript_or_var(agg, subscript): from pymbolic.primitives import Subscript, Variable if len(subscript) == 0: @@ -492,73 +511,91 @@ def restore_and_save_temporaries(kernel): Variable(agg), tuple(map(Variable, subscript))) - from loopy.kernel.data import Assignment - # After loading local temporaries, we need to insert a barrier. - local_temporaries = set() - - from loopy.kernel.tools import DomainChanger - for tval in tvals_to_load: - tval_hw_inames = new_temporaries[tval].hw_inames - dchg = DomainChanger(kernel, - frozenset(sched_item.extra_inames + tval_hw_inames)) - domain = dchg.domain - - domain, hw_inames, dim_inames = augment_domain(tval, domain, "load") - kernel = dchg.get_kernel_with(domain) - - # Add a load instruction. - insn_id = name_gen("{name}.load".format(name=tval)) - - new_insn = Assignment( - subscript_or_var( - tval, dim_inames), - subscript_or_var( - new_temporaries[tval].name, hw_inames + dim_inames), - id=insn_id) - - new_instructions.append(new_insn) - subkernel_prolog.append(RunInstruction(insn_id=insn_id)) - if new_temporaries[tval].orig_temporary.is_local: - local_temporaries.add(new_temporaries[tval].name) - - if local_temporaries: - from loopy.schedule import Barrier - subkernel_prolog.append( - Barrier(kind="local", - comment="for loads of {0}".format( - ", ".join(sorted(local_temporaries))))) - - for tval in tvals_to_spill: - tval_hw_inames = new_temporaries[tval].hw_inames - dchg = DomainChanger(kernel, - frozenset(sched_item.extra_inames + tval_hw_inames)) - domain = dchg.domain - - domain, hw_inames, dim_inames = augment_domain(tval, domain, "spill") - kernel = dchg.get_kernel_with(domain) - - # Add a spill instruction. - insn_id = name_gen("{name}.spill".format(name=tval)) - - new_insn = Assignment( - subscript_or_var( - new_temporaries[tval].name, hw_inames + dim_inames), - subscript_or_var( - tval, dim_inames), - id=insn_id) - - new_instructions.append(new_insn) - subkernel_epilog.append(RunInstruction(insn_id=insn_id)) + def make_loop_nest(inames): + from loopy.schedule import EnterLoop, LeaveLoop + return ( + [EnterLoop(iname=iname) for iname in inames], + list(reversed([LeaveLoop(iname=iname) for iname in inames]))) + + def insert_loads_or_spills(tvals, mode): + assert mode in ["load", "spill"] + from loopy.kernel.data import Assignment + + local_temporaries = set() + + code_block = \ + subkernel_prolog if mode == "load" else subkernel_epilog + + new_kernel = kernel + + for tval in tvals: + from loopy.kernel.tools import DomainChanger + tval_hw_inames = new_temporaries[tval].hw_inames + dchg = DomainChanger(kernel, + frozenset(sched_item.extra_args + tval_hw_inames)) + domain = dchg.domain + + domain, hw_inames, dim_inames, itt = \ + augment_domain_for_temporary_promotion( + new_kernel, domain, new_temporaries[tval], mode, + name_gen) + new_iname_to_tag.update(itt) + + new_kernel = dchg.get_kernel_with(domain) + + # Add the load / spill instruction. + insn_id = name_gen("{name}.{mode}".format(name=tval, mode=mode)) + + args = ( + subscript_or_var( + tval, dim_inames), + subscript_or_var( + new_temporaries[tval].name, hw_inames + dim_inames)) + + if mode == "spill": + args = reversed(args) + + new_insn = Assignment(*args, id=insn_id) + + new_instructions.append(new_insn) + loop_begin, loop_end = make_loop_nest(dim_inames) + code_block.extend( + loop_begin + + [RunInstruction(insn_id=insn_id)] + + loop_end) + if new_temporaries[tval].orig_temporary.is_local: + local_temporaries.add(new_temporaries[tval].name) + + # After loading / before spilling local temporaries, we need to + # insert a barrier. + if local_temporaries: + from loopy.schedule import Barrier + if mode == "load": + subkernel_prolog.append( + Barrier(kind="local", + comment="for loads of {0}".format( + ", ".join(sorted(local_temporaries))))) + else: + subkernel_epilog.insert(0, + Barrier(kind="local", + comment="for spills of {0}".format( + ", ".join(sorted(local_temporaries))))) + return new_kernel + + kernel = insert_loads_or_spills(tvals_to_load, "load") + kernel = insert_loads_or_spills(tvals_to_spill, "spill") # }}} new_schedule.extend( + [sched_item] + subkernel_prolog + subkernel_schedule + - subkernel_epilog) + subkernel_epilog + + # ReturnFromKernel + [schedule[idx]]) # ReturnFromKernel - new_schedule.append(schedule[idx]) idx += 1 # }}} @@ -590,7 +627,7 @@ def map_schedule_onto_host_or_device_impl(kernel): # {{{ Inner mapper function - dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[]) + dummy_call = CallKernel(kernel_name="", extra_args=[]) dummy_return = ReturnFromKernel(kernel_name="") def inner_mapper(start_idx, end_idx, new_schedule): @@ -684,7 +721,7 @@ def map_schedule_onto_host_or_device_impl(kernel): last_kernel_name = kernel_name_gen() new_schedule[idx] = sched_item.copy( kernel_name=last_kernel_name, - extra_inames=list(inames)) + extra_args=list(inames)) elif isinstance(sched_item, ReturnFromKernel): new_schedule[idx] = sched_item.copy( kernel_name=last_kernel_name) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6aca830d9..69906bcbc 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -203,11 +203,14 @@ class CASTBuilder(ASTBuilderBase): return FunctionBody(function_decl, function_body) def idi_to_cgen_declarator(self, kernel, idi): + from loopy.kernel.data import InameArg if (idi.offset_for_name is not None or idi.stride_for_name_and_axis is not None): assert not idi.is_written from cgen import Const return Const(POD(self, idi.dtype, idi.name)) + elif issubclass(idi.arg_class, InameArg): + return InameArg(idi.name, idi.dtype).get_arg_decl(self) else: name = idi.base_name or idi.name var_descr = kernel.get_var_descriptor(name) @@ -221,7 +224,7 @@ class CASTBuilder(ASTBuilderBase): return var_descr.get_arg_decl(self) def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + schedule_index, extra_args): from cgen import FunctionDeclaration, Value name = codegen_result.current_program(codegen_state).name @@ -231,7 +234,8 @@ class CASTBuilder(ASTBuilderBase): return FunctionDeclaration( Value("void", name), [self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in codegen_state.implemented_data_info]) + for idi in + codegen_state.implemented_data_info + extra_args]) def get_temporary_decls(self, codegen_state): from loopy.kernel.data import temp_var_scope diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 9e9d652e1..d63160e1c 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -227,9 +227,9 @@ class CUDACASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + schedule_index, extra_args): fdecl = super(CUDACASTBuilder, self).get_function_declaration( - codegen_state, codegen_result, schedule_index) + codegen_state, codegen_result, schedule_index, extra_args) from cgen.cuda import CudaGlobal, CudaLaunchBounds fdecl = CudaGlobal(fdecl) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 896ea9158..5c680d3de 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -219,7 +219,8 @@ class ISPCASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + schedule_index, extra_args): + assert extra_args == [], "extra_args not yet implemented" name = codegen_result.current_program(codegen_state).name from cgen import (FunctionDeclaration, Value) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index c8f3b6b9e..37e63577b 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -384,9 +384,9 @@ class OpenCLCASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + schedule_index, extra_args): fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( - codegen_state, codegen_result, schedule_index) + codegen_state, codegen_result, schedule_index, extra_args) from cgen.opencl import CLKernel, CLRequiredWorkGroupSize fdecl = CLKernel(fdecl) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 72147daf8..b35de1272 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -436,7 +436,7 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info): for arg_idx, idi in enumerate(implemented_data_info): arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx - if idi.arg_class is not lp.ValueArg: + if not issubclass(idi.arg_class, lp.ValueArg): assert issubclass(idi.arg_class, ArrayBase) # assume each of those generates exactly one... @@ -627,7 +627,7 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): ])) def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + schedule_index, extra_args): # no such thing in Python return None @@ -643,13 +643,17 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): if not lsize: lsize = (1,) + all_args = codegen_state.implemented_data_info + extra_args + value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \ - generate_value_arg_setup(codegen_state.kernel, [self.target.device], - codegen_state.implemented_data_info) + generate_value_arg_setup( + codegen_state.kernel, + [self.target.device], + all_args) arry_arg_code = generate_array_arg_setup( - codegen_state.kernel, - codegen_state.implemented_data_info, - arg_idx_to_cl_arg_idx) + codegen_state.kernel, + all_args, + arg_idx_to_cl_arg_idx) from genpy import Suite, Assign, Assert, Line, Comment from pymbolic.mapper.stringifier import PREC_NONE diff --git a/test/test_loopy.py b/test/test_loopy.py index 1d710bd9d..739a99b8b 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2602,7 +2602,7 @@ def test_kernel_splitting(ctx_factory): def test_kernel_splitting_with_loop(ctx_factory): - #ctx = ctx_factory() + ctx = ctx_factory() knl = lp.make_kernel( "{ [i,k]: 0<=i t_extra_dim[i,0,i] = i <> t_private = a[k,i+1] - <> t_local[k,i] = a[k,i+1] - c[k,i] = a[k,i+1] + t_extra_dim[i,0,i] - out[k,i] = c[k,i] + t_private + t_local[k,i] + c[k,i] = a[k,i+1] + out[k,i] = c[k,i] + t_private """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + ref_knl = knl knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") @@ -2670,13 +2668,12 @@ def test_kernel_splitting_with_loop_and_temporaries(ctx_factory): cgr = lp.generate_code_v2(knl) - assert len(cgr.device_programs) == 3 + assert len(cgr.device_programs) == 2 print(cgr.device_code()) print(cgr.host_code()) - # Doesn't yet work--not passing k, temporaries - #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) def test_global_temporary(ctx_factory): -- GitLab