diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index b317135dd2db7c186d658b695d8b78f02e305b12..9977417198b8e3c7b1069716b02ac74410400df1 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -93,6 +93,20 @@ def get_common_hw_inames(kernel, insn_ids): set.intersection, (get_hw_inames(kernel, id_to_insn[id]) for id in insn_ids)) + +def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel): + from loopy.kernel.data import HardwareParallelTag + new_schedule = [] + + for item in kernel.schedule: + if isinstance(item, (EnterLoop, LeaveLoop)): + tag = kernel.iname_to_tag.get(item.iname) + if isinstance(tag, HardwareParallelTag): + continue + new_schedule.append(item) + + return kernel.copy(schedule=new_schedule) + # }}} @@ -421,11 +435,15 @@ def augment_domain_for_temporary_promotion( new_iname = name_gen("{name}_{mode}_dim_{dim}". format(name=orig_temporary.name, mode=mode, - dim=orig_dim + t_idx)) + dim=t_idx)) domain = domain.set_dim_name( isl.dim_type.set, orig_dim + t_idx, new_iname) - #from loopy.kernel.data import auto - #iname_to_tag[new_iname] = auto + if orig_temporary.is_local: + # If the temporary is tagged local, then loads / stores can be done + # in parallel. + from loopy.kernel.data import AutoFitLocalIndexTag + iname_to_tag[new_iname] = AutoFitLocalIndexTag() + dim_inames.append(new_iname) # Add size information. @@ -433,7 +451,7 @@ def augment_domain_for_temporary_promotion( domain &= aff[0].le_set(aff[new_iname]) size = orig_temporary.shape[t_idx] from loopy.symbolic import aff_from_expr - domain &= aff[new_iname].le_set(aff_from_expr(domain.space, size)) + domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size)) hw_inames = [] @@ -632,17 +650,25 @@ def restore_and_save_temporaries(kernel): # }}} new_iname_to_tag.update(kernel.iname_to_tag) - new_temporary_variables = dict( + updated_temporary_variables = dict( (t.name, t.as_variable()) for t in new_temporaries.values()) - new_temporary_variables.update(kernel.temporary_variables) + updated_temporary_variables.update(kernel.temporary_variables) kernel = kernel.copy( iname_to_tag=new_iname_to_tag, - temporary_variables=new_temporary_variables, + temporary_variables=updated_temporary_variables, instructions=kernel.instructions + new_instructions, schedule=new_schedule ) + from loopy.kernel.tools import assign_automatic_axes + kernel = assign_automatic_axes(kernel) + + # Once assign_automatic_axes() does its job, loops in the schedule + # for newly hardware-tagged inames are no longer necessary (and in + # fact illegal), so remove them. + kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel) + return kernel diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index b570cf4ab7e4ed543213b73716b6271e61e6b69b..428f333f660c1d0965a50f0af1d7c2cd907395a2 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -615,7 +615,9 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from genpy import Assign, Comment, Line def alloc_nbytes(tv): - return tv.dtype.numpy_dtype.itemsize + from six.moves import reduce + from operator import mul + return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from loopy.kernel.data import temp_var_scope diff --git a/test/test_loopy.py b/test/test_loopy.py index 26f4f2faf3b83a24932cad0ac3941696ed3e2636..4b6dc898f086ac6f33da60d367c3b333dfa1806d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1136,6 +1136,46 @@ def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) +def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i,k]: 0<=i<n and 0<=k<3 }", + """ + <> t_local[i % 8,k] = i % 8 + c[k,i] = a[k,i+1] + out[k,i] = c[k,i] + t_local[i % 8,k] + """) + + knl = lp.add_and_infer_dtypes(knl, + {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + + knl = lp.set_temporary_scope(knl, "t_local", "local") + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0") + + # schedule + from loopy.preprocess import preprocess_kernel + knl = preprocess_kernel(knl) + + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(knl) + + # map schedule onto host or device + print(knl) + + cgr = lp.generate_code_v2(knl) + + assert len(cgr.device_programs) == 2 + + print(cgr.device_code()) + print(cgr.host_code()) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8)) + + def test_global_temporary(ctx_factory): ctx = ctx_factory()