From 9ca753fed36d7af22715b758ef86c2ae92c1839a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Thu, 6 Feb 2014 17:59:44 -0600 Subject: [PATCH] Don't store the target CL device in the kernel object --- doc/reference.rst | 2 + doc/tutorial.rst | 31 ++++---- examples/hello-loopy.py | 2 +- loopy/__init__.py | 4 +- loopy/auto_test.py | 15 ++-- loopy/check.py | 23 +++--- loopy/codegen/__init__.py | 9 ++- loopy/compiled.py | 10 +-- loopy/kernel/__init__.py | 58 ++++++++++----- loopy/kernel/creation.py | 5 +- loopy/preprocess.py | 30 +++++--- loopy/schedule.py | 12 ++-- test/test_dg.py | 4 +- test/test_linalg.py | 28 ++++---- test/test_loopy.py | 147 ++++++++++++++++++-------------------- test/test_nbody.py | 20 +++--- test/test_sem_reagan.py | 20 +++--- 17 files changed, 236 insertions(+), 184 deletions(-) diff --git a/doc/reference.rst b/doc/reference.rst index 164a44b5e..fab13029c 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -426,6 +426,8 @@ Finishing up .. autofunction:: generate_loop_schedules +.. autofunction:: get_one_scheduled_kernel + .. autofunction:: generate_code Running diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 72c6c5b0c..7785a4259 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -52,7 +52,7 @@ one vector, doubles it, and writes it to another. .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i]: 0<=i<n }", ... "out[i] = 2*a[i]") @@ -80,9 +80,6 @@ The parts that you see here are the two main components of a loopy kernel: See :ref:`expression-syntax` for a full list of allowed constructs in the left- and right-hand side expression of an assignment. -Loopy also needs to know which OpenCL device to target. ``ctx.devices[0]`` -specifies the first device in our OpenCL context. - As you create and transform kernels, it's useful to know that you can always see loopy's view of a kernel by printing it. @@ -234,6 +231,8 @@ call :func:`loopy.generate_code`: .. doctest:: >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32)) + >>> typed_knl = lp.preprocess_kernel(typed_knl, device=ctx.devices[0]) + >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl) >>> code, _ = lp.generate_code(typed_knl) >>> print code <BLANKLINE> @@ -257,7 +256,7 @@ argument: .. doctest:: >>> # WARNING: Incorrect. - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i,j]: 0<=i,j<n }", ... """ ... out[j,i] = a[i,j] @@ -284,7 +283,7 @@ an explicit dependency: .. doctest:: >>> # WARNING: Incorrect. - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i,j]: 0<=i,j<n }", ... """ ... out[j,i] = a[i,j] {id=transpose} @@ -384,7 +383,7 @@ with identical bounds, for the use of the transpose: .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i,j,ii,jj]: 0<=i,j,ii,jj<n }", ... """ ... out[j,i] = a[i,j] {id=transpose} @@ -429,7 +428,7 @@ zero-fill kernel? .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i,j]: 0<=i,j<n }", ... """ ... a[i,j] = 0 @@ -514,7 +513,7 @@ Consider this example: .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i]: 0<=i<n }", ... "a[i] = 0", assumptions="n>=0") >>> knl = lp.split_iname(knl, "i", 16) @@ -564,7 +563,7 @@ commonly called 'loop tiling': .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i,j]: 0<=i,j<n }", ... "out[i,j] = a[j,i]", ... assumptions="n mod 16 = 0 and n >= 1") @@ -604,7 +603,7 @@ loop's tag to ``"unr"``: .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i]: 0<=i<n }", ... "a[i] = 0", assumptions="n>=0 and n mod 4 = 0") >>> orig_knl = knl @@ -679,7 +678,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i]: 0<=i<n }", ... "a[i] = 0", assumptions="n>=0") >>> knl = lp.split_iname(knl, "i", 128, @@ -724,7 +723,7 @@ assumption: .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i]: 0<=i<n }", ... "a[i] = 0", assumptions="n>=0") >>> orig_knl = knl @@ -821,7 +820,7 @@ Attempting to create this kernel results in an error: .. doctest:: - >>> lp.make_kernel(ctx.devices[0], + >>> lp.make_kernel( ... "{ [i]: 0<=i<n }", ... """ ... out[i] = 5 @@ -848,7 +847,7 @@ be told in order for the error to disappear--note the *assumptions* argument: .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i]: 0<=i<n }", ... """ ... out[i] = 5 @@ -868,7 +867,7 @@ This kernel performs a simple transposition of an input matrix: .. doctest:: - >>> knl = lp.make_kernel(ctx.devices[0], + >>> knl = lp.make_kernel( ... "{ [i,j]: 0<=i,j<n }", ... """ ... out[j,i] = a[i,j] diff --git a/examples/hello-loopy.py b/examples/hello-loopy.py index c7c7ade30..efdbf1315 100644 --- a/examples/hello-loopy.py +++ b/examples/hello-loopy.py @@ -13,7 +13,7 @@ a = cl.array.arange(queue, n, dtype=np.float32) # create # ------ -knl = lp.make_kernel(ctx.devices[0], +knl = lp.make_kernel( "{ [i]: 0<=i<n }", "out[i] = 2*a[i]") diff --git a/loopy/__init__.py b/loopy/__init__.py index 51925f9b6..8934aebc6 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -60,7 +60,7 @@ from loopy.padding import (split_arg_axis, find_padding_multiple, add_padding) from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) -from loopy.schedule import generate_loop_schedules +from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.codegen import generate_code from loopy.compiled import CompiledKernel from loopy.options import Options @@ -94,7 +94,7 @@ __all__ = [ "infer_argument_dtypes", "add_and_infer_dtypes", "preprocess_kernel", "realize_reduction", "infer_unknown_types", - "generate_loop_schedules", + "generate_loop_schedules", "get_one_scheduled_kernel", "generate_code", "CompiledKernel", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 62797d00a..46c21ccad 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -28,6 +28,7 @@ import numpy as np import pyopencl as cl import pyopencl.array as cl_array +import loopy as lp AUTO_TEST_SKIP_RUN = False @@ -36,7 +37,6 @@ import logging logger = logging.getLogger(__name__) - # {{{ create random argument arrays for testing def fill_rand(ary): @@ -398,8 +398,9 @@ def auto_test_vs_ref( ref_queue = cl.CommandQueue(ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - import loopy as lp - for knl in lp.generate_loop_schedules(ref_knl): + pp_ref_knl = lp.preprocess_kernel(ref_knl, device=dev) + + for knl in lp.generate_loop_schedules(pp_ref_knl): ref_sched_kernel = knl break @@ -487,6 +488,12 @@ def auto_test_vs_ref( test_kernels = test_knl else: + from loopy.kernel import kernel_state + if test_knl.state not in [ + kernel_state.PREPROCESSED, + kernel_state.SCHEDULED]: + test_knl = lp.preprocess_kernel(test_knl, device=ctx.devices[0]) + if not test_knl.schedule: test_kernels = lp.generate_loop_schedules(test_knl) else: @@ -604,7 +611,7 @@ def auto_test_vs_ref( print("elapsed: %g s event, %s s marker-event %g s wall " "(%d rounds)%s" % ( - elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates)) + elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates)) if do_check: ref_rates = "" diff --git a/loopy/check.py b/loopy/check.py index 70d16c2d3..df8a61b18 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -373,7 +373,7 @@ def pre_schedule_checks(kernel): # {{{ pre-code-generation checks -def check_sizes(kernel): +def check_sizes(kernel, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory @@ -386,7 +386,7 @@ def check_sizes(kernel): glens, llens = kernel.get_grid_sizes_as_exprs() if (max(len(glens), len(llens)) - > kernel.device.max_work_item_dimensions): + > device.max_work_item_dimensions): raise LoopyError("too many work item dimensions") from pymbolic import evaluate @@ -401,15 +401,15 @@ def check_sizes(kernel): % name, LoopyAdvisory) else: for i in range(len(llens)): - if llens[i] > kernel.device.max_work_item_sizes[i]: + if llens[i] > device.max_work_item_sizes[i]: raise LoopyError("group axis %d too big" % i) from pytools import product - if product(llens) > kernel.device.max_work_group_size: + if product(llens) > device.max_work_group_size: raise LoopyError("work group too big") from pyopencl.characterize import usable_local_mem_size - if kernel.local_mem_use() > usable_local_mem_size(kernel.device): + if kernel.local_mem_use() > usable_local_mem_size(device): raise LoopyError("using too much local memory") from loopy.kernel.data import ConstantArg @@ -417,7 +417,7 @@ def check_sizes(kernel): 1 for arg in kernel.args if isinstance(arg, ConstantArg)) - if const_arg_count > kernel.device.max_constant_args: + if const_arg_count > device.max_constant_args: raise LoopyError("too many constant arguments") @@ -457,11 +457,18 @@ def check_that_shapes_and_strides_are_arguments(kernel): arg.name, ", ".join(deps-integer_arg_names))) -def pre_codegen_checks(kernel): +def pre_codegen_checks(kernel, device=None): try: logger.info("pre-codegen check %s: start" % kernel.name) - check_sizes(kernel) + if device is not None: + check_sizes(kernel, device) + else: + from loopy.diagnostic import warn + warn(kernel, "no_device_in_pre_codegen_checks", + "No device parameter was passed to loopy.pre_codegen_checks. " + "Perhaps you want to pass a device argument to generate_code.") + check_that_shapes_and_strides_are_arguments(kernel) logger.info("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index afd8ed762..c33f336cb 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -23,6 +23,7 @@ THE SOFTWARE. """ +from loopy.diagnostic import LoopyError from pytools import Record import islpy as isl @@ -287,16 +288,20 @@ class ImplementedDataInfo(Record): # {{{ main code generation entrypoint -def generate_code(kernel): +def generate_code(kernel, device=None): if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel) + from loopy.kernel import kernel_state + if kernel.state != kernel_state.SCHEDULED: + raise LoopyError("cannot generate code for a kernel that has not been " + "scheduled") from loopy.preprocess import infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + pre_codegen_checks(kernel, device=device) from cgen import (FunctionBody, FunctionDeclaration, Value, Module, Block, diff --git a/loopy/compiled.py b/loopy/compiled.py index 47bcac8b8..bfed0c5c8 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -632,7 +632,7 @@ class _CLKernelInfo(Record): class CompiledKernel: - def __init__(self, context, kernel, codegen_kwargs={}): + def __init__(self, context, kernel): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -641,7 +641,6 @@ class CompiledKernel: """ self.context = context - self.codegen_kwargs = codegen_kwargs self.kernel = kernel self.packing_controller = SeparateArrayPackingController(kernel) @@ -676,6 +675,9 @@ class CompiledKernel: kernel = infer_unknown_types(kernel, expect_completion=True) if kernel.schedule is None: + from loopy.preprocess import preprocess_kernel + kernel = preprocess_kernel(kernel, self.context.devices[0]) + from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel) @@ -686,7 +688,7 @@ class CompiledKernel: kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) from loopy.codegen import generate_code - code, impl_arg_info = generate_code(kernel, **self.codegen_kwargs) + code, impl_arg_info = generate_code(kernel, device=self.context.devices[0]) if self.kernel.options.write_cl: output = code @@ -724,7 +726,7 @@ class CompiledKernel: kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) from loopy.codegen import generate_code - code, arg_info = generate_code(kernel, **self.codegen_kwargs) + code, arg_info = generate_code(kernel, device=self.context.devices[0]) return code def get_highlighted_code(self, arg_to_dtype=None): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index c6f3abc14..81a64484c 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -26,12 +26,12 @@ THE SOFTWARE. import numpy as np -from pytools import Record, memoize_method +from pytools import RecordWithoutPickling, memoize_method import islpy as isl from islpy import dim_type import re -from pytools import UniqueNameGenerator, generate_unique_possibilities +from pytools import UniqueNameGenerator, generate_unique_names from loopy.library.function import ( default_function_mangler, @@ -79,13 +79,15 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): # {{{ loop kernel object -class LoopKernel(Record): - """These correspond more or less directly to arguments of - :func:`loopy.make_kernel`. +class kernel_state: + INITIAL = 0 + PREPROCESSED = 1 + SCHEDULED = 2 - .. attribute:: device - :class:`pyopencl.Device` +class LoopKernel(RecordWithoutPickling): + """These correspond more or less directly to arguments of + :func:`loopy.make_kernel`. .. attribute:: domains @@ -137,11 +139,15 @@ class LoopKernel(Record): .. attribute:: options An instance of :class:`loopy.Options` + + .. attribute:: state + + A value from :class:`kernel_state`. """ # {{{ constructor - def __init__(self, device, domains, instructions, args=[], schedule=None, + def __init__(self, domains, instructions, args=[], schedule=None, name="loopy_kernel", preambles=[], preamble_generators=[default_preamble_generator], @@ -167,6 +173,8 @@ class LoopKernel(Record): isl_context=None, options=None, + state=kernel_state.INITIAL, + # When kernels get intersected in slab decomposition, # their grid sizes shouldn't change. This provides # a way to forward sub-kernel grid size requests. @@ -247,8 +255,15 @@ class LoopKernel(Record): # overwrites method down below self.get_grid_sizes = get_grid_sizes - Record.__init__(self, - device=device, domains=domains, + if state not in [ + kernel_state.INITIAL, + kernel_state.PREPROCESSED, + kernel_state.SCHEDULED, + ]: + raise ValueError("invalid value for 'state'") + + RecordWithoutPickling.__init__(self, + domains=domains, instructions=instructions, args=args, schedule=schedule, @@ -269,7 +284,8 @@ class LoopKernel(Record): symbol_manglers=symbol_manglers, index_dtype=index_dtype, isl_context=isl_context, - options=options) + options=options, + state=state) # }}} @@ -310,7 +326,7 @@ class LoopKernel(Record): used_ids = set(insn.id for insn in insns) | extra_used_ids - for id_str in generate_unique_possibilities(based_on): + for id_str in generate_unique_names(based_on): if id_str not in used_ids: return id_str @@ -745,7 +761,7 @@ class LoopKernel(Record): dom_intersect_assumptions, iname_idx) .coalesce()) - class BoundsRecord(Record): + class BoundsRecord(RecordWithoutPickling): pass size = (upper_bound_pw_aff - lower_bound_pw_aff + 1) @@ -814,8 +830,6 @@ class LoopKernel(Record): tgt_dict[tag.axis] = size - max_dims = self.device.max_work_item_dimensions - def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -840,10 +854,6 @@ class LoopKernel(Record): size_list.append(size_dict[cur_axis]) - if len(size_list) > max_dims: - raise ValueError("more %s dimensions assigned than supported " - "by hardware (%d > %d)" % (which, len(size_list), max_dims)) - return tuple(size_list) return (to_dim_tuple(global_sizes, "global"), @@ -1016,6 +1026,16 @@ class LoopKernel(Record): # }}} + def __getinitargs__(self): + result = dict( + (key, getattr(self, key)) + for key in self.__class__.fields + if hasattr(self, key)) + + result.pop("cache_manager", None) + + return result + # }}} # vim: foldmethod=marker diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1d5c3d668..e3f7ab14e 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -935,10 +935,9 @@ def resolve_wildcard_deps(knl): # {{{ kernel creation top-level -def make_kernel(device, domains, instructions, kernel_data=["..."], **kwargs): +def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): """User-facing kernel creation entrypoint. - :arg device: :class:`pyopencl.Device` :arg domains: :class:`islpy.BasicSet` :arg instructions: :arg kernel_data: @@ -1101,7 +1100,7 @@ def make_kernel(device, domains, instructions, kernel_data=["..."], **kwargs): kernel_args = arg_guesser.guess_kernel_args_if_requested(kernel_args) from loopy.kernel import LoopKernel - knl = LoopKernel(device, domains, instructions, kernel_args, + knl = LoopKernel(domains, instructions, kernel_args, temporary_variables=temporary_variables, silenced_warnings=silenced_warnings, options=options, diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ec1e750d3..60643f559 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -985,12 +985,12 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): # {{{ temp storage adjust for bank conflict -def adjust_local_temp_var_storage(kernel): +def adjust_local_temp_var_storage(kernel, device): logger.debug("%s: adjust temp var storage" % kernel.name) new_temp_vars = {} - lmem_size = cl_char.usable_local_mem_size(kernel.device) + lmem_size = cl_char.usable_local_mem_size(device) for temp_var in kernel.temporary_variables.itervalues(): if not temp_var.is_local: new_temp_vars[temp_var.name] = \ @@ -1013,12 +1013,12 @@ def adjust_local_temp_var_storage(kernel): # below to avoid bank conflicts from pytools import product - if kernel.device.local_mem_type == cl.device_local_mem_type.GLOBAL: + if device.local_mem_type == cl.device_local_mem_type.GLOBAL: # FIXME: could try to avoid cache associativity disasters new_storage_shape = storage_shape - elif kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL: - min_mult = cl_char.local_memory_bank_count(kernel.device) + elif device.local_mem_type == cl.device_local_mem_type.LOCAL: + min_mult = cl_char.local_memory_bank_count(device) good_incr = None new_storage_shape = storage_shape min_why_not = None @@ -1028,7 +1028,7 @@ def adjust_local_temp_var_storage(kernel): test_storage_shape = storage_shape[:] test_storage_shape[-1] = test_storage_shape[-1] + increment new_mult, why_not = cl_char.why_not_local_access_conflict_free( - kernel.device, temp_var.dtype.itemsize, + device, temp_var.dtype.itemsize, temp_var.shape, test_storage_shape) # will choose smallest increment 'automatically' @@ -1062,7 +1062,12 @@ def adjust_local_temp_var_storage(kernel): # }}} -def preprocess_kernel(kernel): +def preprocess_kernel(kernel, device=None): + from loopy.kernel import kernel_state + if kernel.state != kernel_state.INITIAL: + raise LoopyError("cannot re-preprocess an already preprocessed " + "kernel") + logger.info("%s: preprocess start" % kernel.name) from loopy.subst import expand_subst @@ -1096,11 +1101,18 @@ def preprocess_kernel(kernel): kernel = assign_automatic_axes(kernel) kernel = find_boostability(kernel) kernel = limit_boostability(kernel) - kernel = adjust_local_temp_var_storage(kernel) + + if device is not None: + kernel = adjust_local_temp_var_storage(kernel, device) + else: + from loopy.diagnostic import warn + warn(kernel, "no_device_in_preprocess", + "no device parameter was passed to loopy.preprocess") logger.info("%s: preprocess done" % kernel.name) - return kernel + return kernel.copy( + state=kernel_state.PREPROCESSED) diff --git a/loopy/schedule.py b/loopy/schedule.py index 697322f11..419672a96 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -999,10 +999,12 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): # {{{ main scheduling entrypoint def generate_loop_schedules(kernel, debug_args={}): - loop_priority = kernel.loop_priority + from loopy.kernel import kernel_state + if kernel.state != kernel_state.PREPROCESSED: + raise LoopyError("cannot schedule a kernel that has not been " + "preprocessed") - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + loop_priority = kernel.loop_priority from loopy.check import pre_schedule_checks pre_schedule_checks(kernel) @@ -1049,7 +1051,9 @@ def generate_loop_schedules(kernel, debug_args={}): reverse=False, kind="local") debug.stop() - yield kernel.copy(schedule=gen_sched) + yield kernel.copy( + schedule=gen_sched, + state=kernel_state.SCHEDULED) debug.start() schedule_count += 1 diff --git a/test/test_dg.py b/test/test_dg.py index 956bee2d7..291da4484 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -47,7 +47,7 @@ def test_dg_volume(ctx_factory): K = 10000 - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel([ "{[n,m,k]: 0<= n,m < Np and 0<= k < K}", ], """ @@ -175,7 +175,7 @@ def no_test_dg_surface(ctx_factory): K = 10000 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[m,n,k]: 0<= m < NfpNfaces and 0<= n < Np and 0<= k < K }" ], diff --git a/test/test_linalg.py b/test/test_linalg.py index ef0d0c0ca..47c7600d6 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -73,7 +73,7 @@ def test_axpy(ctx_factory): vec.make_float4(1, 2, 3, 4), vec.make_float4(6, 7, 8, 9)), (np.float32, None, 5, 7), ]: - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "[n] -> {[i]: 0<=i<n}", [ "z[i] = a*x[i]+b*y[i]" @@ -121,7 +121,7 @@ def test_transpose(ctx_factory): n = get_suitable_size(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j]: 0<=i,j<%d}" % n, [ "b[i, j] = a[j, i]" @@ -155,7 +155,7 @@ def test_plain_matrix_mul(ctx_factory): (cl_array.vec.float4, check_float4, 4), (np.float32, None, 1), ]: - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum(k, a[i, k]*b[k, j])" @@ -189,7 +189,7 @@ def test_variable_size_matrix_mul(ctx_factory): n = get_suitable_size(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "[n] -> {[i,j,k]: 0<=i,j,k<n}", [ "c[i, j] = sum(k, a[i, k]*b[k, j]) {id=labl}" @@ -226,7 +226,7 @@ def test_rank_one(ctx_factory): #n = int(get_suitable_size(ctx)**(2.7/2)) n = 16**3 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "[n] -> {[i,j]: 0<=i,j<n}", [ "c[i, j] = a[i]*b[j] {id=mylabel, priority =5}" @@ -303,7 +303,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): n = 6*16*2 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum(k, a[i, k]*b[k, j])" @@ -340,7 +340,7 @@ def test_intel_matrix_mul(ctx_factory): n = 128+32 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum(k, a[i, k]*b[k, j])" @@ -392,7 +392,7 @@ def test_magma_fermi_matrix_mul(ctx_factory): ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): pytest.skip("image format not supported") - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum(k, a[i, k]*b[k, j])" @@ -439,7 +439,7 @@ def test_image_matrix_mul(ctx_factory): ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): pytest.skip("image format not supported") - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum(k, a[i, k]*b[k, j])" @@ -477,7 +477,7 @@ def test_image_matrix_mul_ilp(ctx_factory): n = get_suitable_size(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum(k, a[i, k]*b[k, j])" @@ -510,12 +510,11 @@ def test_image_matrix_mul_ilp(ctx_factory): @pytest.mark.skipif("sys.version_info < (2,6)") def test_ilp_race_matmul(ctx_factory): dtype = np.float32 - ctx = ctx_factory() order = "C" n = 9 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum(k, a[i, k]*b[k, j])" @@ -534,6 +533,7 @@ def test_ilp_race_matmul(ctx_factory): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: + knl = lp.preprocess_kernel(knl) list(lp.generate_loop_schedules(knl)) assert any(isinstance(w.message, WriteRaceConditionWarning) @@ -548,7 +548,7 @@ def test_fancy_matrix_mul(ctx_factory): n = get_suitable_size(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "[n] -> {[i,j,k]: 0<=i,j,k<n }", [ "c[i, j] = sum(k, a[i, k]*b[k, j])" @@ -582,7 +582,7 @@ def test_small_batched_matvec(ctx_factory): K = 9997 Np = 36 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "[K] -> {[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np, [ "result[k, i] = sum(j, d[i, j]*f[k, j])" diff --git a/test/test_loopy.py b/test/test_loopy.py index b19df76b4..07a737900 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -52,7 +52,7 @@ __all__ = [ def test_complicated_subst(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i]: 0<=i<n}", """ f(x) := x*a[x] @@ -84,7 +84,7 @@ def test_complicated_subst(ctx_factory): def test_type_inference_no_artificial_doubles(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i]: 0<=i<n}", """ <> bb = a[i] - b[i] @@ -98,6 +98,7 @@ def test_type_inference_no_artificial_doubles(ctx_factory): ], assumptions="n>=1") + knl = lp.preprocess_kernel(knl, ctx.devices[0]) for k in lp.generate_loop_schedules(knl): code = lp.generate_code(k) assert "double" not in code @@ -106,7 +107,7 @@ def test_type_inference_no_artificial_doubles(ctx_factory): def test_sized_and_complex_literals(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i]: 0<=i<n}", """ <> aa = 5jf @@ -129,7 +130,7 @@ def test_sized_and_complex_literals(ctx_factory): def test_simple_side_effect(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j]: 0<=i,j<100}", """ a[i] = a[i] + 1 @@ -137,6 +138,7 @@ def test_simple_side_effect(ctx_factory): [lp.GlobalArg("a", np.float32, shape=(100,))] ) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) kernel_gen = lp.generate_loop_schedules(knl) for gen_knl in kernel_gen: @@ -148,7 +150,7 @@ def test_simple_side_effect(ctx_factory): def test_nonsense_reduction(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i]: 0<=i<100}", """ a[i] = sum(i, 2) @@ -158,13 +160,13 @@ def test_nonsense_reduction(ctx_factory): import pytest with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl)) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) def test_owed_barriers(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i]: 0<=i<100}", [ "<float32> z[i] = a[i]" @@ -174,6 +176,7 @@ def test_owed_barriers(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) kernel_gen = lp.generate_loop_schedules(knl) for gen_knl in kernel_gen: @@ -184,7 +187,7 @@ def test_owed_barriers(ctx_factory): def test_wg_too_small(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i]: 0<=i<100}", [ "<float32> z[i] = a[i] {id=copy}" @@ -194,6 +197,7 @@ def test_wg_too_small(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) kernel_gen = lp.generate_loop_schedules(knl) import pytest @@ -205,7 +209,7 @@ def test_wg_too_small(ctx_factory): def test_join_inames(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j]: 0<=i,j<16}", [ "b[i,j] = 2*a[i,j]" @@ -227,7 +231,7 @@ def test_join_inames(ctx_factory): def test_divisibility_assumption(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "[n] -> {[i]: 0<=i<n}", [ "b[i] = 2*a[i]" @@ -243,6 +247,7 @@ def test_divisibility_assumption(ctx_factory): knl = lp.split_iname(knl, "i", 16) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) for k in lp.generate_loop_schedules(knl): code = lp.generate_code(k) assert "if" not in code @@ -254,7 +259,7 @@ def test_divisibility_assumption(ctx_factory): def test_multi_cse(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i]: 0<=i<100}", [ "<float32> z[i] = a[i] + a[i]**2" @@ -265,6 +270,7 @@ def test_multi_cse(ctx_factory): knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) kernel_gen = lp.generate_loop_schedules(knl) for gen_knl in kernel_gen: @@ -279,7 +285,7 @@ def test_stencil(ctx_factory): # non-unifiable, two-constant-segments PwAff as the base index) n = 256 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j]: 0<= i,j < %d}" % n, [ "a_offset(ii, jj) := a[ii+1, jj+1]", @@ -320,7 +326,7 @@ def test_stencil(ctx_factory): def test_stencil_with_overfetch(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j]: 0<= i,j < n}", [ "a_offset(ii, jj) := a[ii+2, jj+2]", @@ -361,7 +367,7 @@ def test_stencil_with_overfetch(ctx_factory): def test_eq_constraint(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j]: 0<= i,j < 32}", [ "a[i] = b[i]" @@ -374,6 +380,7 @@ def test_eq_constraint(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") + knl = lp.preprocess_kernel(knl, ctx.devices[0]) kernel_gen = lp.generate_loop_schedules(knl) for knl in kernel_gen: @@ -388,7 +395,7 @@ def test_argmax(ctx_factory): n = 10000 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i]: 0<=i<%d}" % n, [ "<> result = argmax(i, fabs(a[i]))", @@ -489,7 +496,7 @@ def test_fuzz_code_generator(ctx_factory): else: return np.float64 - knl = lp.make_kernel(ctx.devices[0], "{ : }", + knl = lp.make_kernel("{ : }", [lp.ExpressionInstruction("value", expr)], [lp.GlobalArg("value", np.complex128, shape=())] + [ @@ -523,7 +530,7 @@ def test_empty_reduction(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[i]: 0<=i<20}", "[i] -> {[j]: 0<=j<0}" @@ -546,7 +553,7 @@ def test_nested_dependent_reduction(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[i]: 0<=i<n}", "{[j]: 0<=j<i+sumlen}" @@ -575,7 +582,7 @@ def test_multi_nested_dependent_reduction(ctx_factory): dtype = np.dtype(np.int32) ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[itgt]: 0 <= itgt < ntgts}", "{[isrc_box]: 0 <= isrc_box < nboxes}", @@ -603,7 +610,7 @@ def test_recursive_nested_dependent_reduction(ctx_factory): dtype = np.dtype(np.int32) ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[itgt]: 0 <= itgt < ntgts}", "{[isrc_box]: 0 <= isrc_box < nboxes}", @@ -632,7 +639,7 @@ def test_dependent_loop_bounds(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[i]: 0<=i<n}", "{[jj]: 0<=jj<row_len}", @@ -660,7 +667,7 @@ def test_dependent_loop_bounds_2(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[i]: 0<=i<n}", "{[jj]: 0<=jj<row_len}", @@ -696,7 +703,7 @@ def test_dependent_loop_bounds_3(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[i]: 0<=i<n}", "{[jj]: 0<=jj<row_len}", @@ -724,6 +731,8 @@ def test_dependent_loop_bounds_3(ctx_factory): knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + import pytest with pytest.raises(RuntimeError): list(lp.generate_loop_schedules(knl_bad)) @@ -734,7 +743,7 @@ def test_independent_multi_domain(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "{[i]: 0<=i<n}", "{[j]: 0<=j<n}", @@ -770,7 +779,7 @@ def test_bare_data_dependency(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( [ "[znirp] -> {[i]: 0<=i<znirp}", ], @@ -799,7 +808,7 @@ def test_equality_constraints(ctx_factory): n = 10 - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel([ "[n] -> {[i,j]: 0<=i,j<n }", "{[k]: k =i+5 and k < n}", ], @@ -833,7 +842,7 @@ def test_stride(ctx_factory): n = 10 - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel([ "{[i]: 0<=i<n and (exists l: i = 2*l)}", ], [ @@ -859,7 +868,7 @@ def test_domain_dependency_via_existentially_quantified_variable(ctx_factory): n = 10 - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel([ "{[i]: 0<=i<n }", "{[k]: k=i and (exists l: k = 2*l) }", ], @@ -886,9 +895,8 @@ def test_double_sum(ctx_factory): n = 20 - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", - ], [ "a = sum((i,j), i*j)", "b = sum(i, sum(j, i*j))", @@ -910,9 +918,8 @@ def test_double_sum(ctx_factory): def test_ilp_write_race_detection_global(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "[n] -> {[i,j]: 0<=i,j<n }", - ], [ "a[i] = 5+i+j", ], @@ -924,6 +931,8 @@ def test_ilp_write_race_detection_global(ctx_factory): knl = lp.tag_inames(knl, dict(j="ilp")) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: @@ -936,7 +945,7 @@ def test_ilp_write_race_detection_global(ctx_factory): def test_ilp_write_race_avoidance_local(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,j]: 0<=i<16 and 0<=j<17 }", [ "<> a[i] = 5+i+j", @@ -945,6 +954,7 @@ def test_ilp_write_race_avoidance_local(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) for k in lp.generate_loop_schedules(knl): assert k.temporary_variables["a"].shape == (16, 17) @@ -952,7 +962,7 @@ def test_ilp_write_race_avoidance_local(ctx_factory): def test_ilp_write_race_avoidance_private(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[j]: 0<=j<16 }", [ "<> a = 5+j", @@ -961,6 +971,7 @@ def test_ilp_write_race_avoidance_private(ctx_factory): knl = lp.tag_inames(knl, dict(j="ilp")) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) for k in lp.generate_loop_schedules(knl): assert k.temporary_variables["a"].shape == (16,) @@ -971,9 +982,8 @@ def test_write_parameter(ctx_factory): dtype = np.float32 ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", - ], """ a = sum((i,j), i*j) b = sum(i, sum(j, i*j)) @@ -996,9 +1006,8 @@ def test_write_parameter(ctx_factory): def test_arg_shape_guessing(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", - ], """ a = 1.5 + sum((i,j), i*j) b[i, j] = i*j @@ -1019,9 +1028,8 @@ def test_arg_shape_guessing(ctx_factory): def test_arg_guessing(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", - ], """ a = 1.5 + sum((i,j), i*j) b[i, j] = i*j @@ -1037,9 +1045,8 @@ def test_arg_guessing_with_reduction(ctx_factory): #logging.basicConfig(level=logging.DEBUG) ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", - ], """ a = 1.5 + sum((i,j), i*j) d = 1.5 + sum((i,j), b[i,j]) @@ -1057,9 +1064,8 @@ def test_arg_guessing_with_reduction(ctx_factory): def test_nonlinear_index(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", - ], """ a[i*i] = 17 """, @@ -1076,9 +1082,8 @@ def test_nonlinear_index(ctx_factory): def test_triangle_domain(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n and i <= j}", - ], "a[i,j] = 17", assumptions="n>=1") @@ -1092,9 +1097,8 @@ def test_offsets_and_slicing(ctx_factory): n = 20 - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i<n and 0<=j<m }", - ], """ b[i,j] = 2*a[i,j] """, @@ -1128,7 +1132,7 @@ def test_offsets_and_slicing(ctx_factory): def test_vector_ilp_with_prefetch(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{ [i]: 0<=i<n }", "out[i] = 2*a[i]", [ @@ -1157,7 +1161,7 @@ def test_convolution(ctx_factory): dtype = np.float32 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \ -f_w <= f_x,f_y <= f_w \ and 0 <= im_x < im_w and 0 <= im_y < im_h \ @@ -1224,7 +1228,7 @@ def test_convolution_with_nonzero_base(ctx_factory): dtype = np.float32 - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \ -f_w <= f_x,f_y <= f_w \ and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \ @@ -1276,9 +1280,8 @@ def test_c_instruction(ctx_factory): #logging.basicConfig(level=logging.DEBUG) ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", - ], [ lp.CInstruction("i", """ x = sin((float) i); @@ -1301,7 +1304,7 @@ def test_c_instruction(ctx_factory): def test_dependent_domain_insn_iname_finding(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel([ "{[isrc_box]: 0<=isrc_box<nsrc_boxes}", "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}", ], @@ -1332,11 +1335,8 @@ def test_dependent_domain_insn_iname_finding(ctx_factory): def test_inames_deps_from_write_subscript(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i,j<n}", - ], """ <> src_ibox = source_boxes[i] <int32> something = 5 @@ -1352,11 +1352,8 @@ def test_inames_deps_from_write_subscript(ctx_factory): def test_split_reduction(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j,k]: 0<=i,j,k<n}", - ], """ b = sum((i,j,k), a[i,j,k]) """, @@ -1372,9 +1369,8 @@ def test_split_reduction(ctx_factory): def test_modulo_indexing(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], [ + knl = lp.make_kernel( "{[i,j]: 0<=i<n and 0<=j<5}", - ], """ b[i] = sum(j, a[(i+j)%n]) """, @@ -1396,7 +1392,7 @@ def test_rob_stroud_bernstein(ctx_factory): # NOTE: tmp would have to be zero-filled beforehand - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[el, i2, alpha1,alpha2]: \ 0 <= el < nels and \ 0 <= i2 < nqp1d and \ @@ -1448,7 +1444,7 @@ def test_rob_stroud_bernstein_full(ctx_factory): # NOTE: result would have to be zero-filled beforehand - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \ 0 <= el < nels and \ 0 <= i2 < nqp1d and \ @@ -1517,7 +1513,7 @@ def test_rob_stroud_bernstein_full(ctx_factory): def test_vector_types(ctx_factory, vec_len): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{ [i,j]: 0<=i<n and 0<=j<vec_len }", "out[i,j] = 2*a[i,j]", [ @@ -1547,7 +1543,6 @@ def test_conditional(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - ctx.devices[0], "{ [i,j]: 0<=i,j<n }", """ <> my_a = a[i,j] {id=read_a} @@ -1577,7 +1572,7 @@ def test_ilp_loop_bound(ctx_factory): # throughout. In ILP'd loops, not so much. ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{ [i,j,k]: 0<=i,j,k<n }", """ out[i,k] = sum(j, a[i,j]*b[j,k]) @@ -1604,9 +1599,7 @@ def test_arg_shape_uses_assumptions(ctx_factory): # static shape for out, which is at least 1 x 1 in size, but otherwise of # size n x n. - ctx = ctx_factory() - - lp.make_kernel(ctx.devices[0], + lp.make_kernel( "{ [i,j]: 0<=i,j<n }", """ out[i,j] = 2*a[i,j] @@ -1618,7 +1611,7 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{ [i]: 0<=i<n }", "a[i] = 2*a[i]", assumptions="n>=1") @@ -1651,7 +1644,7 @@ def test_multiple_writes_to_local_temporary(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "{[i,e]: 0<=i<5 and 0<=e<nelements}", """ <> temp[i, 0] = 17 @@ -1659,8 +1652,10 @@ def test_multiple_writes_to_local_temporary(ctx_factory): """) knl = lp.tag_inames(knl, dict(i="l.0")) - code, _ = lp.generate_code(knl) - print code + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + for k in lp.generate_loop_schedules(knl): + code, _ = lp.generate_code(k) + print code if __name__ == "__main__": diff --git a/test/test_nbody.py b/test/test_nbody.py index 7ec973156..65e5658b5 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -40,17 +40,17 @@ def test_nbody(ctx_factory): dtype = np.float32 ctx = ctx_factory() - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "[N] -> {[i,j,k]: 0<=i,j<N and 0<=k<3 }", - [ - "axdist(k) := x[i,k]-x[j,k]", - "invdist := rsqrt(sum_float32(k, axdist(k)**2))", - "pot[i] = sum_float32(j, if(i != j, invdist, 0))", - ], [ - lp.GlobalArg("x", dtype, shape="N,3", order="C"), - lp.GlobalArg("pot", dtype, shape="N", order="C"), - lp.ValueArg("N", np.int32), - ], name="nbody", assumptions="N>=1") + [ + "axdist(k) := x[i,k]-x[j,k]", + "invdist := rsqrt(sum_float32(k, axdist(k)**2))", + "pot[i] = sum_float32(j, if(i != j, invdist, 0))", + ], [ + lp.GlobalArg("x", dtype, shape="N,3", order="C"), + lp.GlobalArg("pot", dtype, shape="N", order="C"), + lp.ValueArg("N", np.int32), + ], name="nbody", assumptions="N>=1") seq_knl = knl diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index 2de1db43b..cfa23f35d 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -44,19 +44,19 @@ def test_tim2d(ctx_factory): field_shape = (K_sym, n, n) # K - run-time symbolic - knl = lp.make_kernel(ctx.devices[0], + knl = lp.make_kernel( "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n, - [ - "ur(a,b) := sum(o, D[a,o]*u[e,o,b])", - "us(a,b) := sum(o, D[b,o]*u[e,a,o])", + [ + "ur(a,b) := sum(o, D[a,o]*u[e,o,b])", + "us(a,b) := sum(o, D[b,o]*u[e,a,o])", - #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)", + #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)", - "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)", - "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)", - "lap[e,i,j] = " - " sum(m, D[m,i]*Gux(m,j))" - "+ sum(m, D[m,j]*Guy(i,m))" + "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)", + "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)", + "lap[e,i,j] = " + " sum(m, D[m,i]*Gux(m,j))" + "+ sum(m, D[m,j]*Guy(i,m))" ], [ -- GitLab