From 3c173a0adf64653429ad66ff6adb84b8af041323 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Tue, 11 Jun 2013 00:36:05 -0400 Subject: [PATCH] Add set_loop_priority, remove check_kernels, auto-schedule in auto_test --- doc/reference.rst | 4 +- loopy/__init__.py | 11 +++- loopy/auto_test.py | 30 ++++++---- loopy/check.py | 124 ++++++++++++++++----------------------- loopy/compiled.py | 14 ----- loopy/kernel/__init__.py | 73 ++++++++++++++--------- loopy/schedule.py | 16 +++-- test/test_dg.py | 9 +-- test/test_linalg.py | 62 +++++--------------- test/test_loopy.py | 48 ++++----------- test/test_nbody.py | 44 ++++++-------- test/test_sem_reagan.py | 32 ++++------ 12 files changed, 193 insertions(+), 274 deletions(-) diff --git a/doc/reference.rst b/doc/reference.rst index 975ef80a9..c8d041079 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -185,6 +185,8 @@ Wrangling inames .. autofunction:: remove_unused_inames +.. autofunction:: set_loop_priority + Dealing with Substitution Rules ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -238,8 +240,6 @@ Finishing up .. autofunction:: generate_loop_schedules -.. autofunction:: check_kernels - .. 
autofunction:: generate_code Automatic Testing diff --git a/loopy/__init__.py b/loopy/__init__.py index 0d868cb47..7851f8a32 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -70,7 +70,6 @@ from loopy.schedule import generate_loop_schedules from loopy.codegen import generate_code from loopy.compiled import CompiledKernel from loopy.auto_test import auto_test_vs_ref -from loopy.check import check_kernels __all__ = [ "auto", @@ -89,7 +88,7 @@ __all__ = [ "preprocess_kernel", "realize_reduction", "infer_unknown_types", "generate_loop_schedules", "generate_code", - "CompiledKernel", "auto_test_vs_ref", "check_kernels", + "CompiledKernel", "auto_test_vs_ref", "make_kernel", "split_iname", "join_inames", "tag_inames", "duplicate_inames", @@ -757,6 +756,14 @@ def remove_unused_inames(knl, inames=None): # }}} +# {{{ set loop priority + +def set_loop_priority(kernel, loop_priority): + return kernel.copy(loop_priority=loop_priority) + +# }}} + + # {{{ convenience: add_prefetch # {{{ process footprint_subscripts diff --git a/loopy/auto_test.py b/loopy/auto_test.py index ea172db2f..c28583695 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -325,14 +325,14 @@ def _enumerate_cl_devices_for_ref_test(): # {{{ main automatic testing entrypoint def auto_test_vs_ref( - ref_knl, ctx, kernel_gen, op_count=[], op_label=[], parameters={}, + ref_knl, ctx, test_knl, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, code_op=None, dump_binary=False, codegen_kwargs={}, options=[], fills_entire_output=True, do_check=True, check_result=None ): - """Compare results of `ref_knl` to the kernels generated by the generator - `kernel_gen`. + """Compare results of `ref_knl` to the kernels generated by + scheduling *test_knl*. 
:arg check_result: a callable with :class:`numpy.ndarray` arguments *(result, reference_result)* returning a a tuple (class:`bool`, @@ -385,8 +385,7 @@ def auto_test_vs_ref( properties=cl.command_queue_properties.PROFILING_ENABLE) import loopy as lp - ref_kernel_gen = lp.generate_loop_schedules(ref_knl) - for knl in lp.check_kernels(ref_kernel_gen, parameters): + for knl in lp.generate_loop_schedules(ref_knl): ref_sched_kernel = knl break @@ -460,7 +459,22 @@ def auto_test_vs_ref( properties=cl.command_queue_properties.PROFILING_ENABLE) args = None - for i, kernel in enumerate(kernel_gen): + from loopy.kernel import LoopKernel + if not isinstance(test_knl, LoopKernel): + from warnings import warn + warn("Passing an iterable of kernels to auto_test_vs_ref " + "is deprecated--just pass the kernel instead. " + "Scheduling will be performed in auto_test_vs_ref.", + DeprecationWarning, stacklevel=2) + + test_kernels = test_knl + else: + if not test_knl.schedule: + test_kernels = lp.generate_loop_schedules(test_knl) + else: + test_kernels = [test_knl] + + for i, kernel in enumerate(test_kernels): compiled = CompiledKernel(ctx, kernel, options=options, codegen_kwargs=codegen_kwargs) @@ -575,8 +589,4 @@ def auto_test_vs_ref( # }}} -from pytools import MovedFunctionDeprecationWrapper - -auto_test_vs_seq = MovedFunctionDeprecationWrapper(auto_test_vs_ref) - # vim: foldmethod=marker diff --git a/loopy/check.py b/loopy/check.py index e2de6405e..ae1d2a925 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -33,6 +33,54 @@ logger = logging.getLogger(__name__) # {{{ sanity checks run during scheduling +def check_sizes(kernel): + import loopy as lp + + from loopy import LoopyAdvisory + + parameters = {} + for arg in kernel.args: + if isinstance(arg, lp.ValueArg) and arg.approximately is not None: + parameters[arg.name] = arg.approximately + + glens, llens = kernel.get_grid_sizes_as_exprs() + + if (max(len(glens), len(llens)) + > kernel.device.max_work_item_dimensions): + raise 
RuntimeError("too many work item dimensions") + + from pymbolic import evaluate + from pymbolic.mapper.evaluator import UnknownVariableError + try: + glens = evaluate(glens, parameters) + llens = evaluate(llens, parameters) + except UnknownVariableError, name: + from warnings import warn + warn("could not check axis bounds because no value " + "for variable '%s' was given via 'approximately'" + % name, LoopyAdvisory) + else: + for i in range(len(llens)): + if llens[i] > kernel.device.max_work_item_sizes[i]: + raise RuntimeError("group axis %d too big" % i) + + from pytools import product + if product(llens) > kernel.device.max_work_group_size: + raise RuntimeError("work group too big") + + from pyopencl.characterize import usable_local_mem_size + if kernel.local_mem_use() > usable_local_mem_size(kernel.device): + raise RuntimeError("using too much local memory") + + from loopy.kernel.data import ConstantArg + const_arg_count = sum( + 1 for arg in kernel.args + if isinstance(arg, ConstantArg)) + + if const_arg_count > kernel.device.max_constant_args: + raise RuntimeError("too many constant arguments") + + def check_for_unused_hw_axes_in_insns(kernel): group_size, local_size = kernel.get_grid_sizes_as_exprs() @@ -319,6 +367,7 @@ def run_automatic_checks(kernel): try: logger.info("sanity-check %s: start" % kernel.name) + check_sizes(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel) check_for_unused_hw_axes_in_insns(kernel) @@ -419,79 +468,4 @@ def check_implemented_domains(kernel, implemented_domains, code=None): # }}} -# {{{ user-invoked checks - -def get_problems(kernel, parameters): - """ - :return: *(max_severity, list of (severity, msg))*, where *severity* - ranges from 1-5. '5' means 'will certainly not run'. 
- """ - msgs = [] - - def msg(severity, s): - msgs.append((severity, s)) - - glens, llens = kernel.get_grid_sizes_as_exprs() - - if (max(len(glens), len(llens)) - > kernel.device.max_work_item_dimensions): - msg(5, "too many work item dimensions") - - from pymbolic import evaluate - from pymbolic.mapper.evaluator import UnknownVariableError - try: - glens = evaluate(glens, parameters) - llens = evaluate(llens, parameters) - except UnknownVariableError, name: - msg(1, "could not check axis bounds because no value " - "for variable '%s' was passed to check_kernels()" - % name) - else: - for i in range(len(llens)): - if llens[i] > kernel.device.max_work_item_sizes[i]: - msg(5, "group axis %d too big" % i) - - from pytools import product - if product(llens) > kernel.device.max_work_group_size: - msg(5, "work group too big") - - import pyopencl as cl - from pyopencl.characterize import usable_local_mem_size - if kernel.local_mem_use() > usable_local_mem_size(kernel.device): - if kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL: - msg(5, "using too much local memory") - else: - msg(4, "using more local memory than available--" - "possibly OK due to cache nature") - - from loopy.kernel.data import ConstantArg - const_arg_count = sum( - 1 for arg in kernel.args - if isinstance(arg, ConstantArg)) - - if const_arg_count > kernel.device.max_constant_args: - msg(5, "too many constant arguments") - - max_severity = 0 - for sev, msg in msgs: - max_severity = max(sev, max_severity) - return max_severity, msgs - - -def check_kernels(kernel_gen, parameters={}, kill_level_min=5, - warn_level_min=1): - for kernel in kernel_gen: - max_severity, msgs = get_problems(kernel, parameters) - - for severity, msg in msgs: - if severity >= warn_level_min: - from warnings import warn - from loopy import LoopyAdvisory - warn(msg, LoopyAdvisory) - - if max_severity < kill_level_min: - yield kernel - -# }}} - # vim: foldmethod=marker diff --git a/loopy/compiled.py b/loopy/compiled.py 
index ee8702335..e2746b381 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -266,20 +266,6 @@ class CompiledKernel: specific arguments. """ - import loopy as lp - - # {{{ do scheduling, if not yet done - - if not isinstance(kernel, lp.LoopKernel): - # someone threw us an iterable of kernels - - kernel = _get_kernel_from_iterable(kernel) - - # Whether we need to call check_kernels. Since we don't have parameter - # values now, we'll do that on first invocation. - - # }}} - self.context = context self.kernel = kernel self.codegen_kwargs = codegen_kwargs diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 46b79186c..66bb1f56d 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -87,33 +87,55 @@ class LoopKernel(Record): """These correspond more or less directly to arguments of :func:`loopy.make_kernel`. - :ivar device: :class:`pyopencl.Device` - :ivar domains: a list of :class:`islpy.BasicSet` instances - :ivar instructions: - :ivar args: - :ivar schedule: - :ivar name: - :ivar preambles: - :ivar preamble_generators: - :ivar assumptions: - :ivar local_sizes: - :ivar temporary_variables: - :ivar iname_to_tag: - :ivar function_manglers: - :ivar symbol_manglers: - - The following arguments are not user-facing: - - :ivar substitutions: a mapping from substitution names to + .. attribute:: device + + :class:`pyopencl.Device` + + .. attribute:: domains + + a list of :class:`islpy.BasicSet` instances + + .. attribute:: instructions + .. attribute:: args + .. attribute:: schedule + + *None* or a list of :class:`loopy.schedule.ScheduleItem` + + .. attribute:: name + .. attribute:: preambles + .. attribute:: preamble_generators + .. attribute:: assumptions + .. attribute:: local_sizes + .. attribute:: temporary_variables + .. attribute:: iname_to_tag + .. attribute:: function_manglers + .. attribute:: symbol_manglers + + .. 
attribute:: substitutions + + a mapping from substitution names to :class:`SubstitutionRule` objects - :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr, + + .. attribute:: iname_slab_increments + + a dictionary mapping inames to (lower_incr, upper_incr) tuples that will be separated out in the execution to generate 'bulk' slabs with fewer conditionals. - :ivar applied_iname_rewrites: A list of past substitution dictionaries that + + .. attribute:: loop_priority + + A list of inames. The earlier in the list the iname occurs, the earlier + it will be scheduled. (This applies to inames with non-parallel + implementation tags.) + + .. attribute:: applied_iname_rewrites + + A list of past substitution dictionaries that were applied to the kernel. These are stored so that they may be repeated on expressions the user specifies later. - :ivar cache_manager: - :ivar isl_context: + + .. attribute:: cache_manager + .. attribute:: isl_context """ # {{{ constructor @@ -134,8 +156,8 @@ class LoopKernel(Record): ], symbol_manglers=[opencl_symbol_mangler], - # non-user-facing iname_slab_increments={}, + loop_priority=[], applied_iname_rewrites=[], cache_manager=None, index_dtype=np.int32, @@ -145,10 +167,6 @@ class LoopKernel(Record): # their grid sizes shouldn't change. This provides # a way to forward sub-kernel grid size requests. get_grid_sizes=None): - """ - :arg domain: a :class:`islpy.BasicSet`, or a string parseable to - a basic set by the isl. 
Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" - """ if cache_manager is None: from loopy.kernel.tools import SetOperationCacheManager @@ -235,6 +253,7 @@ class LoopKernel(Record): preamble_generators=preamble_generators, assumptions=assumptions, iname_slab_increments=iname_slab_increments, + loop_priority=loop_priority, temporary_variables=temporary_variables, local_sizes=local_sizes, iname_to_tag=iname_to_tag, diff --git a/loopy/schedule.py b/loopy/schedule.py index f1e06a37e..4d5c0314a 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -33,19 +33,23 @@ logger = logging.getLogger(__name__) # {{{ schedule items -class EnterLoop(Record): +class ScheduleItem(Record): + __slots__ = [] + + +class EnterLoop(ScheduleItem): __slots__ = ["iname"] -class LeaveLoop(Record): +class LeaveLoop(ScheduleItem): __slots__ = ["iname"] -class RunInstruction(Record): +class RunInstruction(ScheduleItem): __slots__ = ["insn_id"] -class Barrier(Record): +class Barrier(ScheduleItem): __slots__ = ["comment"] # }}} @@ -785,7 +789,9 @@ def insert_barriers(kernel, schedule, level=0): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, loop_priority=[], debug_args={}): +def generate_loop_schedules(kernel, debug_args={}): + loop_priority = kernel.loop_priority + from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) diff --git a/test/test_dg.py b/test/test_dg.py index 1b54a8328..8de1c8eec 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -151,11 +151,8 @@ def test_dg_volume(ctx_factory): variant_simple_padding, variant_fancy_padding ]: - kernel_gen = lp.generate_loop_schedules(variant(knl)) - kernel_gen = lp.check_kernels(kernel_gen, parameters_dict) - lp.auto_test_vs_ref( - seq_knl, ctx, kernel_gen, parameters=parameters_dict, + seq_knl, ctx, variant(knl), parameters=parameters_dict, #codegen_kwargs=dict(with_annotation=True) ) @@ -225,10 +222,8 @@ def no_test_dg_surface(ctx_factory): for variant in [ variant_basic, ]: - kernel_gen = 
lp.generate_loop_schedules(variant(knl)) - kernel_gen = lp.check_kernels(kernel_gen, parameters_dict) - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, parameters=parameters_dict) + lp.auto_test_vs_ref(seq_knl, ctx, variant(knl), parameters=parameters_dict) if __name__ == "__main__": diff --git a/test/test_linalg.py b/test/test_linalg.py index 9bf916221..c9605e5ee 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -106,10 +106,7 @@ def test_axpy(ctx_factory): for variant in [variant_cpu, variant_gpu]: #for variant in [ variant_gpu]: - kernel_gen = lp.generate_loop_schedules(variant(knl)) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, variant(knl), op_count=[np.dtype(dtype).itemsize*n*3/1e9], op_label=["GBytes"], parameters={"a": a, "b": b, "n": n}, check_result=check) @@ -141,10 +138,7 @@ def test_transpose(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, {}) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[dtype.itemsize*n**2*2/1e9], op_label=["GByte"], parameters={}) @@ -181,10 +175,7 @@ def test_plain_matrix_mul(ctx_factory): knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, {}) - - lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"], parameters={"n": n}, check_result=check) @@ -220,10 +211,7 @@ def test_variable_size_matrix_mul(ctx_factory): knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = 
lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], parameters={"n": n}) @@ -253,6 +241,7 @@ def test_rank_one(ctx_factory): def variant_1(knl): knl = lp.add_prefetch(knl, "a") knl = lp.add_prefetch(knl, "b") + knl = lp.set_loop_priority(knl, ["i", "j"]) return knl def variant_2(knl): @@ -261,6 +250,7 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.set_loop_priority(knl, ["i", "j"]) knl = lp.add_prefetch(knl, "a") knl = lp.add_prefetch(knl, "b") return knl @@ -299,11 +289,7 @@ def test_rank_one(ctx_factory): #for variant in [variant_1, variant_2, variant_3, variant_4]: for variant in [variant_4]: - kernel_gen = lp.generate_loop_schedules(variant(knl), - loop_priority=["i", "j"]) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, variant(knl), op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], parameters={"n": n}) @@ -340,10 +326,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], parameters={}) @@ -388,11 +371,9 @@ def test_intel_matrix_mul(ctx_factory): #knl = lp.add_prefetch(knl, 'b', # ["k_inner", ("j_inner_inner", "j_inner_outer"),]) - kernel_gen = lp.generate_loop_schedules(knl) #hints=["k_outer", "k_inner_outer", "k_inner_inner"] - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], 
op_label=["GFlops"], parameters={}) @@ -434,10 +415,7 @@ def test_magma_fermi_matrix_mul(ctx_factory): #knl = lp.add_prefetch(knl, 'b', # ["k_inner", ("j_inner_inner", "j_inner_outer"),]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], parameters={}) @@ -470,10 +448,7 @@ def test_image_matrix_mul(ctx_factory): knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], parameters={}, print_ref_code=True) @@ -510,10 +485,7 @@ def test_image_matrix_mul_ilp(ctx_factory): knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], parameters={}) @@ -575,10 +547,7 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], parameters=dict(n=n)) @@ -612,10 +581,7 @@ def test_small_batched_matvec(ctx_factory): knl = lp.split_arg_axis(knl, ("f", 0), pad_mult) knl = lp.add_padding(knl, "f", 0, align_bytes) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = 
lp.check_kernels(kernel_gen, dict(K=K)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[K*2*Np**2/1e9], op_label=["GFlops"], parameters=dict(K=K)) diff --git a/test/test_loopy.py b/test/test_loopy.py index 5875912ba..ba695a5df 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -114,8 +114,7 @@ def test_sized_and_complex_literals(ctx_factory): ], assumptions="n>=1") - lp.auto_test_vs_ref(knl, ctx, lp.generate_loop_schedules(knl), - parameters=dict(n=5)) + lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5)) def test_simple_side_effect(ctx_factory): @@ -130,7 +129,6 @@ def test_simple_side_effect(ctx_factory): ) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen) for gen_knl in kernel_gen: print gen_knl @@ -168,7 +166,6 @@ def test_owed_barriers(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen) for gen_knl in kernel_gen: compiled = lp.CompiledKernel(ctx, gen_knl) @@ -189,7 +186,6 @@ def test_wg_too_small(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen) import pytest for gen_knl in kernel_gen: @@ -216,10 +212,7 @@ def test_join_inames(ctx_factory): knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"]) knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"]) - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen) - - lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen) + lp.auto_test_vs_ref(ref_knl, ctx, knl) def test_divisibility_assumption(ctx_factory): @@ -245,10 +238,7 @@ def test_divisibility_assumption(ctx_factory): code = lp.generate_code(k) assert "if" not in code - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen) - - lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(ref_knl, ctx, knl, 
parameters={"n": 16**3}) @@ -267,7 +257,6 @@ def test_multi_cse(ctx_factory): knl = lp.add_prefetch(knl, "a", []) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen) for gen_knl in kernel_gen: compiled = lp.CompiledKernel(ctx, gen_knl) @@ -302,6 +291,7 @@ def test_stencil(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"]) + knl = lp.set_loop_priority(knl, ["i_outer", "i_inner_0", "j_0"]) return knl def variant_2(knl): @@ -309,14 +299,11 @@ def test_stencil(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], fetch_bounding_box=True) + knl = lp.set_loop_priority(knl, ["i_outer", "i_inner_0", "j_0"]) return knl for variant in [variant_1, variant_2]: - kernel_gen = lp.generate_loop_schedules(variant(knl), - loop_priority=["i_outer", "i_inner_0", "j_0"]) - kernel_gen = lp.check_kernels(kernel_gen) - - lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), fills_entire_output=False, print_ref_code=False, op_count=[n*n], op_label=["cells"]) @@ -352,15 +339,12 @@ def test_stencil_with_overfetch(ctx_factory): slabs=(1, 1)) knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], fetch_bounding_box=True) + knl = lp.set_loop_priority(knl, ["i_outer", "i_inner_0", "j_0"]) return knl for variant in [variant_overfetch]: - kernel_gen = lp.generate_loop_schedules(variant(knl), - loop_priority=["i_outer", "i_inner_0", "j_0"]) - kernel_gen = lp.check_kernels(kernel_gen) - n = 200 - lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), fills_entire_output=False, print_ref_code=False, op_count=[n*n], parameters=dict(n=n), op_label=["cells"]) @@ -382,7 +366,6 @@ def test_eq_constraint(ctx_factory): knl = lp.split_iname(knl, 
"i_inner", 16, outer_tag=None, inner_tag="l.0") kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen) for knl in kernel_gen: print lp.generate_code(knl) @@ -828,10 +811,7 @@ def test_equality_constraints(ctx_factory): #print knl #print knl.domains[0].detect_equalities() - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, parameters=dict(n=n), print_ref_code=True) @@ -857,10 +837,7 @@ def test_stride(ctx_factory): seq_knl = knl - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, parameters=dict(n=n), fills_entire_output=False) @@ -888,10 +865,7 @@ def test_domain_dependency_via_existentially_quantified_variable(ctx_factory): seq_knl = knl - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, knl, parameters=dict(n=n), ) diff --git a/test/test_nbody.py b/test/test_nbody.py index 71830b758..2d80c1383 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -23,15 +23,11 @@ THE SOFTWARE. 
""" - - import numpy as np -import pyopencl as cl import loopy as lp -from pyopencl.tools import pytest_generate_tests_for_pyopencl \ - as pytest_generate_tests - +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) def test_nbody(ctx_factory): @@ -46,54 +42,50 @@ def test_nbody(ctx_factory): "pot[i] = sum_float32(j, if(i != j, invdist, 0))", ], [ - lp.GlobalArg("x", dtype, shape="N,3", order="C"), - lp.GlobalArg("pot", dtype, shape="N", order="C"), - lp.ValueArg("N", np.int32), - ], - name="nbody", assumptions="N>=1") + lp.GlobalArg("x", dtype, shape="N,3", order="C"), + lp.GlobalArg("pot", dtype, shape="N", order="C"), + lp.ValueArg("N", np.int32), + ], + name="nbody", assumptions="N>=1") seq_knl = knl def variant_1(knl): knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0", - slabs=(0,1)) - knl = lp.split_iname(knl, "j", 256, slabs=(0,1)) - return knl, [] + slabs=(0, 1)) + knl = lp.split_iname(knl, "j", 256, slabs=(0, 1)) + return knl def variant_cpu(knl): knl = lp.expand_subst(knl) knl = lp.split_iname(knl, "i", 1024, - outer_tag="g.0", slabs=(0,1)) + outer_tag="g.0", slabs=(0, 1)) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) - return knl, [] + return knl def variant_gpu(knl): knl = lp.expand_subst(knl) knl = lp.split_iname(knl, "i", 256, - outer_tag="g.0", inner_tag="l.0", slabs=(0,1)) - knl = lp.split_iname(knl, "j", 256, slabs=(0,1)) + outer_tag="g.0", inner_tag="l.0", slabs=(0, 1)) + knl = lp.split_iname(knl, "j", 256, slabs=(0, 1)) knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"], ["x_fetch_j", "x_fetch_k"]) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) knl = lp.tag_inames(knl, dict(x_fetch_k="unr")) - return knl, ["j_outer", "j_inner"] + knl = lp.set_loop_priority(knl, ["j_outer", "j_inner"]) + return knl n = 3000 for variant in [variant_1, variant_cpu, variant_gpu]: - variant_knl, loop_prio = variant(knl) - kernel_gen = 
lp.generate_loop_schedules(variant_knl, - loop_priority=loop_prio) - kernel_gen = lp.check_kernels(kernel_gen, dict(N=n)) + variant_knl = variant(knl) - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, variant_knl, op_count=[n**2*1e-6], op_label=["M particle pairs"], parameters={"N": n}) - - if __name__ == "__main__": import sys if len(sys.argv) > 1: diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index 8ff5bf8ac..819de845d 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -23,16 +23,11 @@ THE SOFTWARE. """ - - import numpy as np -import pyopencl as cl import loopy as lp -from pyopencl.tools import pytest_generate_tests_for_pyopencl \ - as pytest_generate_tests - - +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) def test_tim2d(ctx_factory): @@ -64,14 +59,14 @@ def test_tim2d(ctx_factory): ], [ - lp.GlobalArg("u", dtype, shape=field_shape, order=order), - lp.GlobalArg("lap", dtype, shape=field_shape, order=order), - lp.GlobalArg("G", dtype, shape=(3,)+field_shape, order=order), - # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order), - lp.GlobalArg("D", dtype, shape=(n, n), order=order), - # lp.ImageArg("D", dtype, shape=(n, n)), - lp.ValueArg("K", np.int32, approximately=1000), - ], + lp.GlobalArg("u", dtype, shape=field_shape, order=order), + lp.GlobalArg("lap", dtype, shape=field_shape, order=order), + lp.GlobalArg("G", dtype, shape=(3,)+field_shape, order=order), + # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order), + lp.GlobalArg("D", dtype, shape=(n, n), order=order), + # lp.ImageArg("D", dtype, shape=(n, n)), + lp.ValueArg("K", np.int32, approximately=1000), + ], name="semlap2D", assumptions="K>=1") knl = lp.duplicate_inames(knl, "o", within="ur") @@ -103,18 +98,13 @@ def test_tim2d(ctx_factory): return knl for variant in [variant_orig]: - kernel_gen = lp.generate_loop_schedules(variant(knl)) - kernel_gen = 
lp.check_kernels(kernel_gen, dict(K=1000)) - K = 1000 - lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, variant(knl), op_count=[K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9], op_label=["GFlops"], parameters={"K": K}) - - if __name__ == "__main__": import sys if len(sys.argv) > 1: -- GitLab