diff --git a/MEMO b/MEMO index 02eaf2264a85c6a81c840c091dc90d288df25dc0..3e4be2755f1aee8932850938103155ed379cd1af 100644 --- a/MEMO +++ b/MEMO @@ -41,6 +41,10 @@ Things to consider To-do ^^^^^ +- Multi-domain + +- Kernel splitting (via what variables get computed in a kernel) + - What if no universally valid precompute base index expression is found? (test_intel_matrix_mul with n = 6*16, e.g.?) @@ -66,6 +70,8 @@ To-do Future ideas ^^^^^^^^^^^^ +- Array language + - reg rolling - When duplicating inames, use iname aliases to relieve burden on isl diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py index fad00937348d7bb419c2ae802f729a2054395ed5..5a387a68e64bdfca4fb65005605fc2606c94e779 100644 --- a/loopy/codegen/expression.py +++ b/loopy/codegen/expression.py @@ -90,6 +90,12 @@ class TypeInferenceMapper(CombineMapper): if expr.name in self.kernel.all_inames(): return np.dtype(np.int16) # don't force single-precision upcast + for mangler in self.kernel.symbol_manglers: + result = mangler(expr.name) + if result is not None: + result_dtype, _ = result + return result_dtype + raise TypeInferenceFailure("nothing known about '%s'" % expr.name) def map_lookup(self, expr): @@ -162,14 +168,19 @@ class LoopyCCodeMapper(CCodeMapper): self.rec(self.var_subst_map[expr.name], prec)) else: return str(self.rec(self.var_subst_map[expr.name], prec)) - else: - if expr.name in self.kernel.arg_dict: - arg = self.kernel.arg_dict[expr.name] - from loopy.kernel import _ShapedArg - if isinstance(arg, _ShapedArg) and arg.shape == (): - return "*"+expr.name - - return CCodeMapper.map_variable(self, expr, prec) + elif expr.name in self.kernel.arg_dict: + arg = self.kernel.arg_dict[expr.name] + from loopy.kernel import _ShapedArg + if isinstance(arg, _ShapedArg) and arg.shape == (): + return "*"+expr.name + + for mangler in self.kernel.symbol_manglers: + result = mangler(expr.name) + if result is not None: + _, c_name = result + return c_name + + return 
CCodeMapper.map_variable(self, expr, prec) def map_tagged_variable(self, expr, enclosing_prec): return expr.name diff --git a/loopy/compiled.py b/loopy/compiled.py index 88382ba3842b79b5516719a3dbcf77c051da76bf..f2ac06851edfd0c35afb3248f8679b334fcf5b89 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -153,6 +153,15 @@ class CompiledKernel: is_written = arg.name in self.kernel.get_written_variables() val = kwargs_copy.pop(arg.name, None) + + # automatically transfer host-side arrays + if isinstance(arg, lp.GlobalArg): + if isinstance(val, np.ndarray): + # synchronous, so nothing to worry about + val = cl_array.to_device(queue, val, allocator=allocator) + elif val is not None: + encountered_non_numpy = True + if val is None: if not is_written: raise TypeError("must supply input argument '%s'" % arg.name) @@ -168,14 +177,6 @@ class CompiledKernel: else: assert _arg_matches_spec(arg, val, kwargs) - # automatically transfer host-side arrays - if isinstance(arg, lp.GlobalArg): - if isinstance(val, np.ndarray): - # synchronous, so nothing to worry about - val = cl_array.to_device(queue, val, allocator=allocator) - else: - encountered_non_numpy = True - if is_written: outputs.append(val) @@ -196,7 +197,7 @@ class CompiledKernel: *args, g_times_l=True, wait_for=wait_for) - if out_host is None and encountered_non_numpy: + if out_host is None and not encountered_non_numpy: out_host = True if out_host: outputs = [o.get() for o in outputs] diff --git a/loopy/kernel.py b/loopy/kernel.py index 051b57fec954b1ece1c30f8c9296ac503a880fde..de67d95722336a0c4e292dfe5c81778e225cd2e1 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -114,7 +114,7 @@ def parse_tag(tag): # {{{ arguments -class _ShapedArg: +class _ShapedArg(object): def __init__(self, name, dtype, strides=None, shape=None, order="C", offset=0): """ @@ -192,7 +192,7 @@ class ConstantArg(_ShapedArg): return "<ConstantArg '%s' of type %s and shape (%s)>" % ( self.name, self.dtype, ",".join(str(i) for i in 
# Suffixes of the <float.h>-style limit macros that OpenCL C defines as plain
# integer constants (e.g. FLT_DIG, DBL_MANT_DIG), even though they share the
# FLT_/DBL_ prefix with the genuinely floating-point limits.
_OPENCL_INT_LIMIT_SUFFIXES = frozenset([
    "DIG",
    "MANT_DIG",
    "MAX_10_EXP",
    "MAX_EXP",
    "MIN_10_EXP",
    "MIN_EXP",
    "RADIX",
    ])


def opencl_symbol_mangler(name):
    """Recognize OpenCL C built-in constant macros by name.

    :arg name: the symbol name as it appears in an expression.
    :returns: a tuple ``(result_dtype, c_name)`` if *name* looks like an
        OpenCL built-in constant, where *result_dtype* is a
        :class:`numpy.dtype` and *c_name* is the C-level symbol to emit
        (always *name* itself here), or *None* if the name is not recognized.

    Recognized families:

    * ``FLT_*``: :class:`numpy.float32`, except for the integer-valued limits
      (``FLT_DIG``, ``FLT_MANT_DIG``, ``FLT_RADIX``, ...), which are
      :class:`numpy.int32`.
    * ``DBL_*``: :class:`numpy.float64`, with the same integer-valued
      exceptions.
    * ``M_*``: :class:`numpy.float32` if the name ends in ``_F``, otherwise
      :class:`numpy.float64`.
    """
    # FIXME: should be more picky about exact names

    for prefix, float_type in (("FLT_", np.float32), ("DBL_", np.float64)):
        if name.startswith(prefix):
            # The exponent/digit/radix limits are 'int' constants in OpenCL C;
            # reporting them as floating point would mislead type inference.
            if name[len(prefix):] in _OPENCL_INT_LIMIT_SUFFIXES:
                return np.dtype(np.int32), name

            return np.dtype(float_type), name

    if name.startswith("M_"):
        # Single-precision math constants carry an _F suffix (M_PI_F, ...).
        if name.endswith("_F"):
            return np.dtype(np.float32), name

        return np.dtype(np.float64), name

    return None
The expansion given in this @@ -541,6 +558,7 @@ class LoopKernel(Record): iname_to_tag={}, substitutions={}, function_manglers=[default_function_mangler, single_arg_function_mangler], + symbol_manglers=[opencl_symbol_mangler], defines={}, # non-user-facing @@ -758,7 +776,8 @@ class LoopKernel(Record): lowest_priority_inames=lowest_priority_inames, breakable_inames=breakable_inames, applied_iname_rewrites=applied_iname_rewrites, - function_manglers=function_manglers) + function_manglers=function_manglers, + symbol_manglers=symbol_manglers) def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): if insns is None: