diff --git a/bin/loopy b/bin/loopy index 57dac8038ca638269249783151e4b195b05db223..ef4be07444d742ab9059674661f60b1fb6262467 100644 --- a/bin/loopy +++ b/bin/loopy @@ -24,6 +24,15 @@ def to_python_literal(value): # It's a float return repr(float(value)) + if value.endswith("f"): + try: + float(value[:-1]) + except ValueError: + pass + else: + # It's a float + return repr(float(value[:-1])) + return repr(value) diff --git a/doc/reference.rst b/doc/reference.rst index 3d78bee36d3e70231fd306344ba2f119529cd494..59ab3c9864d2af4f3a7b885c5c16dcc10e522a08 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -39,6 +39,7 @@ Loopy's expressions are a slight superset of the expressions supported by * duplication of reduction inames * complex-valued arithmetic * tagging of array access and substitution rule use ("$") +* ``indexof``, ``indexof_vec`` .. _types: diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 25cb764f3cd38c2b10b59edb71d850e28d6da2d6..24cb034630d401ba87ebc10b63c7f47aa498316f 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -997,8 +997,8 @@ transformation exists in :func:`loopy.add_prefetch`: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - a_fetch = a[16 * gid(0) + lid(0)]; acc_k = 0.0f; + a_fetch = a[16 * gid(0) + lid(0)]; for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch; out[16 * gid(0) + lid(0)] = acc_k; @@ -1021,10 +1021,10 @@ earlier: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) - a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)]; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) acc_k = 0.0f; + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) + a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)]; barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { @@ -1459,8 +1459,8 @@ Now to make things more interesting, we'll create a kernel with barriers: { __local int c[50 * 10 * 99]; <BLANKLINE> - for (int i = 0; i <= 49; ++i) - for (int j = 0; j <= 9; ++j) + for (int j = 0; j <= 9; ++j) + for (int i = 0; i <= 49; ++i) { barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; c[990 * i + 99 * j + lid(0) + 1 + gid(0) * 128] = 2 * a[980 * i + 98 * j + lid(0) + 1 + gid(0) * 128]; diff --git a/loopy/auto_test.py b/loopy/auto_test.py index fd4e5621343b21e78e8beeba21e21588e11a6206..d51f669ca1e9d0afb53d8c39a7a784e772075767 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -320,10 +320,6 @@ def _enumerate_cl_devices_for_ref_test(): cpu_devs = [] for pf in cl.get_platforms(): - if pf.name == "Portable Computing Language": - # pocl not mature enough yet, sadly - continue - for dev in pf.get_devices(): if dev.type & cl.device_type.CPU: cpu_devs.append(dev) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 37c8a12ee125fb26c54e84918a538c8d36e0cb4a..225f7e7fec30d38159a93df6ec4df35a45c21bc6 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -561,7 +561,7 @@ def generate_code(kernel, device=None): preamble_generators = (kernel.preamble_generators + kernel.target.preamble_generators()) for prea_gen in preamble_generators: - preambles.extend(prea_gen(kernel.target, seen_dtypes, seen_functions)) + preambles.extend(prea_gen(kernel, seen_dtypes, seen_functions)) seen_preamble_tags = set() dedup_preambles = [] diff --git a/loopy/expression.py b/loopy/expression.py index 3194ac571b1959dca77f6451c78a29096824ed48..94eaf4448f8489f55c3185cc3e3ec121bbb08993 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -184,6 +184,9 @@ class TypeInferenceMapper(CombineMapper): if isinstance(identifier, Variable): identifier = identifier.name + if identifier in ["indexof", "indexof_vec"]: + return self.kernel.index_dtype + arg_dtypes = tuple(self.rec(par) for par in expr.parameters) mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 1cd7aa6f3e4945c306442ec09f4077d65759050d..f2bbb288249332dc6a333f95fa2600fa48967e60 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -25,7 +25,7 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError -def c_preprocess(source, defines=None, filename="<floopy source>"): +def c_preprocess(source, defines=None, filename=None, include_paths=None): """ :arg source: a string, possibly containing C preprocessor constructs :arg defines: a list of strings as they might occur after a @@ -38,10 +38,22 @@ def c_preprocess(source, defines=None, filename="<floopy source>"): except ImportError: raise LoopyError("Using the C preprocessor requires PLY to be installed") + input_dirname = None + if filename is None: + filename = "<floopy source>" + else: + from os.path import dirname + input_dirname = dirname(filename) + lexer = lex.lex(cpp) from ply.cpp import Preprocessor p = Preprocessor(lexer) + if input_dirname is not None: + p.add_path(input_dirname) + if include_paths: + for inc_path in include_paths: + p.add_path(inc_path) if defines: for d in defines: diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d2d178bc459cb0c231a78a5a1b2c3b8092d07536..13afaa66d05b8dce89a2eb3f1f06e8b752dc5420 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -302,7 +302,7 @@ class LoopKernel(RecordWithoutPickling): manglers = self.target.function_manglers() + self.function_manglers for mangler in manglers: - mangle_result = mangler(self.target, identifier, arg_dtypes) + mangle_result = mangler(self, identifier, arg_dtypes) if mangle_result is not None: return mangle_result @@ -316,7 +316,7 @@ class LoopKernel(RecordWithoutPickling): manglers = self.target.symbol_manglers() + self.symbol_manglers for mangler in manglers: - result = mangler(self.target, identifier) + result = mangler(self, identifier) if result is not None: return result diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 9a3c9c0cfbc79e9aba934b7a7c051665b19c19c5..92fb232e92a6e891efcef7022e0dceedcb4f692f 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -306,8 +306,8 @@ def parse_array_dim_tags(dim_tags, use_increasing_target_axes=False): raise LoopyError("may not mix C/F dim_tag specifications with " "explicit specification of layout nesting levels") else: - target_axis_to_has_explicit_nesting_level[parsed_dim_tag.target_axis] = \ - has_explicit_nesting_level + target_axis_to_has_explicit_nesting_level[ + parsed_dim_tag.target_axis] = has_explicit_nesting_level # }}} diff --git a/loopy/library/function.py b/loopy/library/function.py index e494169bbe5b83df852ea3d483ed3640381891f6..df623a4770f4f14a7952ee2e0edbf59939de1cfd 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -23,19 +23,19 @@ THE SOFTWARE. """ -def default_function_mangler(target, name, arg_dtypes): +def default_function_mangler(kernel, name, arg_dtypes): from loopy.library.reduction import reduction_function_mangler manglers = [reduction_function_mangler] for mangler in manglers: - result = mangler(target, name, arg_dtypes) + result = mangler(kernel, name, arg_dtypes) if result is not None: return result return None -def single_arg_function_mangler(target, name, arg_dtypes): +def single_arg_function_mangler(kernel, name, arg_dtypes): if len(arg_dtypes) == 1: dtype, = arg_dtypes return dtype, name diff --git a/loopy/schedule.py b/loopy/schedule.py index bba42381a5a4f67e957c137a2c866dd816c523c4..f22b95d45275d54d473a97a8f7a0dfde69555d6b 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -440,9 +440,6 @@ class SchedulerState(Record): *Note:* ``ilp`` and ``vec`` are not 'parallel' for the purposes of the scheduler. See :attr:`ilp_inames`, :attr:`vec_inames`. - .. attribute:: loop_priority - - .. rubric:: Time-varying scheduler state .. attribute:: active_inames @@ -536,7 +533,10 @@ def generate_loop_schedules_internal( def insn_sort_key(insn_id): insn = kernel.id_to_insn[insn_id] - return (insn.priority, len(active_groups & insn.groups)) + + # Sort by insn.id as a last criterion to achieve deterministic + # schedule generation order. + return (insn.priority, len(active_groups & insn.groups), insn.id) insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids, key=insn_sort_key, reverse=True) @@ -864,7 +864,11 @@ def generate_loop_schedules_internal( found_viable_schedule = False for iname in sorted(tier, - key=lambda iname: iname_to_usefulness.get(iname, 0), + key=lambda iname: ( + iname_to_usefulness.get(iname, 0), + # Sort by iname to achieve deterministic + # ordering of generated schedules. + iname), reverse=True): for sub_sched in generate_loop_schedules_internal( diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 71844cfbf057c4533669dfbdce71e13f5e0a1ca3..112e7e5b95b3354092a6ced076a9a03140c87105 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -35,6 +35,7 @@ import islpy as isl from loopy.expression import dtype_to_type_context, TypeInferenceMapper +from loopy.diagnostic import LoopyError from loopy.tools import is_integer @@ -74,6 +75,22 @@ class LoopyCCodeMapper(RecursiveMapper): else: return s + def find_array(self, expr): + if expr.aggregate.name in self.kernel.arg_dict: + ary = self.kernel.arg_dict[expr.aggregate.name] + elif expr.aggregate.name in self.kernel.temporary_variables: + ary = self.kernel.temporary_variables[expr.aggregate.name] + else: + raise RuntimeError("nothing known about subscripted variable '%s'" + % expr.aggregate.name) + + from loopy.kernel.array import ArrayBase + if not isinstance(ary, ArrayBase): + raise RuntimeError("subscripted variable '%s' is not an array" + % expr.aggregate.name) + + return ary + def rec(self, expr, prec, type_context=None, needed_dtype=None): if needed_dtype is None: return RecursiveMapper.rec(self, expr, prec, type_context) @@ -150,18 +167,7 @@ class LoopyCCodeMapper(RecursiveMapper): if not isinstance(expr.aggregate, Variable): return base_impl(expr, enclosing_prec, type_context) - if expr.aggregate.name in self.kernel.arg_dict: - ary = self.kernel.arg_dict[expr.aggregate.name] - elif expr.aggregate.name in self.kernel.temporary_variables: - ary = self.kernel.temporary_variables[expr.aggregate.name] - else: - raise RuntimeError("nothing known about subscripted variable '%s'" - % expr.aggregate.name) - - from loopy.kernel.array import ArrayBase - if not isinstance(ary, ArrayBase): - raise RuntimeError("subscripted variable '%s' is not an array" - % expr.aggregate.name) + ary = self.find_array(expr) from loopy.kernel.array import get_access_info from pymbolic import evaluate @@ -367,11 +373,54 @@ class LoopyCCodeMapper(RecursiveMapper): "for constant '%s'" % expr) def map_call(self, expr, enclosing_prec, type_context): - from pymbolic.primitives import Variable + from pymbolic.primitives import Variable, Subscript from pymbolic.mapper.stringifier import PREC_NONE identifier = expr.function + # {{{ implement indexof, indexof_vec + + if identifier.name in ["indexof", "indexof_vec"]: + if len(expr.parameters) != 1: + raise LoopyError("%s takes exactly one argument" % identifier.name) + arg, = expr.parameters + if not isinstance(arg, Subscript): + raise LoopyError( + "argument to %s must be a subscript" % identifier.name) + + ary = self.find_array(arg) + + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + access_info = get_access_info(self.kernel.target, ary, arg.index, + lambda expr: evaluate(expr, self.codegen_state.var_subst_map), + self.codegen_state.vectorization_info) + + from loopy.kernel.data import ImageArg + if isinstance(ary, ImageArg): + raise LoopyError("%s does not support images" % identifier.name) + + if identifier.name == "indexof": + return access_info.subscripts[0] + elif identifier.name == "indexof_vec": + from loopy.kernel.array import VectorArrayDimTag + ivec = None + for iaxis, dim_tag in enumerate(ary.dim_tags): + if isinstance(dim_tag, VectorArrayDimTag): + ivec = iaxis + + if ivec is None: + return access_info.subscripts[0] + else: + return ( + access_info.subscripts[0]*ary.shape[ivec] + + access_info.vector_index) + + else: + raise RuntimeError("should not get here") + + # }}} + c_name = None if isinstance(identifier, Variable): identifier = identifier.name diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py index eebe6f5da0b81fa9b4c1ac7b4cda0ba8b1ac283e..d038c329a9eff73a95458fde44078c26f3dbbc56 100644 --- a/loopy/target/opencl/__init__.py +++ b/loopy/target/opencl/__init__.py @@ -105,7 +105,7 @@ def _register_vector_types(dtype_registry): # {{{ function mangler -def opencl_function_mangler(target, name, arg_dtypes): +def opencl_function_mangler(kernel, name, arg_dtypes): if not isinstance(name, str): return None @@ -134,7 +134,7 @@ def opencl_function_mangler(target, name, arg_dtypes): # {{{ symbol mangler -def opencl_symbol_mangler(target, name): +def opencl_symbol_mangler(kernel, name): # FIXME: should be more picky about exact names if name.startswith("FLT_"): return np.dtype(np.float32), name @@ -155,7 +155,7 @@ def opencl_symbol_mangler(target, name): # {{{ preamble generator -def opencl_preamble_generator(target, seen_dtypes, seen_functions): +def opencl_preamble_generator(kernel, seen_dtypes, seen_functions): has_double = False for dtype in seen_dtypes: @@ -229,7 +229,7 @@ class OpenCLTarget(CTarget): def is_vector_dtype(self, dtype): return list(vec.types.values()) - def get_vector_dtype(self, base, count): + def vector_dtype(self, base, count): return vec.types[base, count] def wrap_function_declaration(self, kernel, fdecl): diff --git a/setup.cfg b/setup.cfg index 2dc94705f792ddca37ca7bf1f8ddec67be7ed3b9..d34ecdd6d469b931929187c36be61a2ba4be25ad 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,3 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402 max-line-length=85 diff --git a/test/test_loopy.py b/test/test_loopy.py index d7d7dc5768f163a84306ef234af08cf620c4066c..c8072108032e82517773709f5f5fd257928d3bd9 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2191,6 +2191,43 @@ def test_variable_size_temporary(): lp.generate_code(k) +def test_indexof(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + ''' { [i,j]: 0<=i,j<5 } ''', + ''' out[i,j] = indexof(out[i,j])''') + + knl = lp.set_options(knl, write_cl=True) + + (evt, (out,)) = knl(queue) + out = out.get() + + assert np.array_equal(out.ravel(order="C"), np.arange(25)) + + +def test_indexof_vec(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + if ctx.devices[0].platform.name.startswith("Portable"): + # Accurate as of 2015-10-08 + pytest.skip("POCL miscompiles vector code") + + knl = lp.make_kernel( + ''' { [i,j,k]: 0<=i,j,k<4 } ''', + ''' out[i,j,k] = indexof_vec(out[i,j,k])''') + + knl = lp.tag_inames(knl, {"i": "vec"}) + knl = lp.tag_data_axes(knl, "out", "vec,c,c") + knl = lp.set_options(knl, write_cl=True) + + (evt, (out,)) = knl(queue) + #out = out.get() + #assert np.array_equal(out.ravel(order="C"), np.arange(25)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])