diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 87daa9fc4fc01b0625066cfd7c934c046b546930..fd884f1940fb7916108c8b165ffd9d0c817766e2 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -923,6 +923,7 @@ Consider the following example: ... out[16*i_outer + i_inner] = sum(k, a_temp[k]) ... """) >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) + >>> knl = lp.set_temporary_scope(knl, "a_temp", "local") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) @@ -1479,7 +1480,7 @@ Now to make things more interesting, we'll create a kernel with barriers: ... "..." ... ]) >>> knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32)) - >>> knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0") + >>> knl = lp.split_iname(knl, "k", 128, inner_tag="l.0") >>> code, _ = lp.generate_code(lp.preprocess_kernel(knl)) >>> print(code) #define lid(N) ((int) get_local_id(N)) @@ -1488,6 +1489,8 @@ Now to make things more interesting, we'll create a kernel with barriers: __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e) { __local int c[50 * 10 * 99]; + + int const k_outer = 0; for (int j = 0; j <= 9; ++j) for (int i = 0; i <= 49; ++i) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index e0ec0df31dac5cf3ee470d5e6337060cd84a5099..c26dc28a5e9f95331b849a12de1d1756088db859 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1200,7 +1200,7 @@ class LoopKernel(RecordWithoutPickling): if lhs: core = "%s <- %s" % ( - Fore.BLUE+lhs+Style.RESET_ALL, + Fore.CYAN+lhs+Style.RESET_ALL, Fore.MAGENTA+rhs+Style.RESET_ALL, ) else: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 09437f213aa498d86a562f4363b15037fb396d90..c0f42e55aaf7710a8a91781cb2f0d0af905871dd 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -315,7 +315,7 @@ def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names): def find_temporary_scope(kernel): - logger.debug("%s: mark local temporaries" % kernel.name) + logger.debug("%s: find temporary scope" % kernel.name) new_temp_vars = {} from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag, @@ -388,8 +388,10 @@ def find_temporary_scope(kernel): grpparallel_compute_inames, temp_var_scope.GLOBAL), ]: - if (apin != cpin and bool(locparallel_assignee_inames)): - warn_with_kernel(kernel, "write_race_local(%s)" % insn_id, + if (apin != cpin and bool(apin)): + warn_with_kernel( + kernel, + "write_race_%s(%s)" % (scope_descr, insn_id), "instruction '%s' looks invalid: " "it assigns to indices based on %s IDs, but " "its temporary '%s' cannot be made %s because " @@ -405,7 +407,6 @@ def find_temporary_scope(kernel): # parallel inames of that kind: and bool(cpin)): desired_scope = max(desired_scope, scope) - break desired_scope_per_insn.append(desired_scope) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index d6eb1a32cdd32b2adaed08f92f4bb63d9501e5ce..ae9ebacf5ca798c58f87234e72e5f64187f7afc6 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -372,7 +372,7 @@ def format_insn(kernel, insn_id): Style = kernel.options._style return "[%s] %s%s%s <- %s%s%s" % ( format_insn_id(kernel, insn_id), - Fore.BLUE, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, + Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index bdd5773b3f68d631343871adee423144086ca71a..b245c44d33b3a601855f67777601b1397dcd6efa 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -35,7 +35,7 @@ from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, warn_with_kernel from warnings import warn import logging @@ -172,9 +172,17 @@ def check_sizes(kernel, device): if product(llens) > device.max_work_group_size: raise LoopyError("work group too big") + local_mem_use = kernel.local_mem_use() + from pyopencl.characterize import usable_local_mem_size - if kernel.local_mem_use() > usable_local_mem_size(device): - raise LoopyError("using too much local memory") + import numbers + if isinstance(local_mem_use, numbers.Integral): + if local_mem_use > usable_local_mem_size(device): + raise LoopyError("using too much local memory") + else: + warn_with_kernel(kernel, "non_constant_local_mem", + "The amount of local memory used by the kernel " + "is not a constant. This will likely cause problems.") from loopy.kernel.data import ConstantArg const_arg_count = sum( diff --git a/test/test_statistics.py b/test/test_statistics.py index 4bcacf59eb5752800cee2322844d6e33968dc91e..68be5b8a260858e058619c796b3836611c8d4f0f 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -538,7 +538,7 @@ def test_barrier_counter_barriers(): name="weird2", ) knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32)) - knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "k", 128, inner_tag="l.0") poly = lp.get_synchronization_poly(knl) print(poly) n = 512