diff --git a/MEMO b/MEMO index a281b98085a5ce9e37102cbf94231596e6563b79..05eaf2de75a810bd9ff726646ac8ae53d56bd382 100644 --- a/MEMO +++ b/MEMO @@ -34,6 +34,8 @@ Things to consider - Measure efficiency of corner cases +- Loopy as a data model for implementing custom rewritings + To-do ^^^^^ @@ -45,12 +47,18 @@ To-do - user interface for dim length prescription +- Way too many barriers in SEM test. + - Deal with equality constraints. (These arise, e.g., when partitioning a loop of length 16 into 16s.) Future ideas ^^^^^^^^^^^^ +- Float4 joining on fetch/store? + +- How can one automatically generate something like microblocks? + - Better for loop bound generation -> Try a triangular loop diff --git a/loopy/kernel.py b/loopy/kernel.py index 3c3f25751d7f0acdb04b147927f09b6e6e2acf37..1cc356f77d4d19a4c9c334a1de2bce49a83b4f63 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -699,7 +699,8 @@ class LoopKernel(Record): if all_inames_by_insns != self.all_inames(): raise RuntimeError("inames collected from instructions (%s) " "do not match domain inames (%s)" - % (", ".join(all_inames_by_insns), ", ".join(self.all_inames()))) + % (", ".join(sorted(all_inames_by_insns)), + ", ".join(sorted(self.all_inames())))) global_sizes = {} local_sizes = {} @@ -780,11 +781,13 @@ class LoopKernel(Record): def __str__(self): lines = [] - for insn in self.instructions: - lines.append(str(insn)) - lines.append("") for iname in sorted(self.all_inames()): lines.append("%s: %s" % (iname, self.iname_to_tag.get(iname))) + lines.append("") + lines.append(str(self.domain)) + lines.append("") + for insn in self.instructions: + lines.append(str(insn)) return "\n".join(lines) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 15573cf251c52557a2baf6158063bb640f2bf8f7..990a4fc7f2cb526b73a7afa514f9e571c85c0c40 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -6,6 +6,86 @@ import pyopencl.characterize as cl_char +# {{{ make reduction variables unique + +def make_reduction_variables_unique(kernel): + # {{{ count number of uses of each reduction iname + + def count_reduction_iname_uses(expr, rec): + rec(expr.expr) + for iname in expr.inames: + reduction_iname_uses[iname] = ( + reduction_iname_uses.get(iname, 0) + + 1) + + from loopy.symbolic import ReductionCallbackMapper + cb_mapper = ReductionCallbackMapper(count_reduction_iname_uses) + + reduction_iname_uses = {} + + for insn in kernel.instructions: + cb_mapper(insn.expression) + + # }}} + + # {{{ make iname uses in reduction unique + + def ensure_reduction_iname_uniqueness(expr, rec): + child = rec(expr.expr) + my_created_inames = [] + new_red_inames = [] + + for iname in expr.inames: + if reduction_iname_uses[iname] > 1: + new_iname = kernel.make_unique_var_name(iname, set(new_inames)) + + old_inames.append(iname) + new_inames.append(new_iname) + my_created_inames.append(new_iname) + new_red_inames.append(new_iname) + reduction_iname_uses[iname] -= 1 + else: + new_red_inames.append(iname) + + if my_created_inames: + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic import var + subst_dict = dict( + (old_iname, var(new_iname)) + for old_iname, new_iname in zip(expr.inames, my_created_inames)) + subst_map = SubstitutionMapper(make_subst_func(subst_dict)) + + child = subst_map(child) + + from loopy.symbolic import Reduction + return Reduction( + operation=expr.operation, + inames=tuple(new_red_inames), + expr=child) + + new_insns = [] + old_inames = [] + new_inames = [] + + from loopy.symbolic import ReductionCallbackMapper + cb_mapper = ReductionCallbackMapper(ensure_reduction_iname_uniqueness) + + new_insns = [ + insn.copy(expression=cb_mapper(insn.expression)) + for insn in kernel.instructions] + + domain = kernel.domain + from loopy.isl_helpers import duplicate_axes + for old, new in zip(old_inames, new_inames): + domain = duplicate_axes(domain, [old], [new]) + + return kernel.copy(instructions=new_insns, domain=domain) + + # }}} + +# }}} + # {{{ rewrite reduction to imperative form def realize_reduction(kernel): @@ -466,6 +546,7 @@ def adjust_local_temp_var_storage(kernel): def preprocess_kernel(kernel): + kernel = make_reduction_variables_unique(kernel) kernel = realize_reduction(kernel) # {{{ check that all CSEs have been realized diff --git a/loopy/symbolic.py b/loopy/symbolic.py index bb4fd83fb22bcbe46a72c23db2d266ab1ba472b4..5e08fbd44a12b031361e29dcbc29d3a64c12e790 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -514,7 +514,7 @@ class IndexVariableFinder(CombineMapper): result = self.rec(expr.expr) if not (set(expr.inames) & result): raise RuntimeError("reduction '%s' does not depend on " - "reduction inames" % expr) + "reduction inames (%s)" % (expr, ",".join(expr.inames))) if self.include_reduction_inames: return result else: diff --git a/test/test_sem.py b/test/test_sem.py index bdc7d678ed66dae150acdd5e75c4939b5352c983..1d9f530e96638324ec7c81b0cd8b1973071b9ca4 100644 --- a/test/test_sem.py +++ b/test/test_sem.py @@ -4,7 +4,6 @@ import numpy as np import numpy.linalg as la import pyopencl as cl import pyopencl.array as cl_array -import pyopencl.clrandom as cl_random import loopy as lp from pyopencl.tools import pytest_generate_tests_for_pyopencl \ @@ -287,16 +286,16 @@ def test_sem_3d(ctx_factory): # K - run-time symbolic n = 8 knl = lp.make_kernel(ctx.devices[0], - "[K] -> {[i,j,k,e,m,mp]: 0<=i,j,k,m<%d and 0<=e<K}" % n, + "[K] -> {[i,j,k,e,m]: 0<=i,j,k,m<%d and 0<=e<K}" % n, [ "[|i,j,k] <float32> ur[i,j,k] = sum_float32(m, D[i,m]*u[m,j,k,e])", "[|i,j,k] <float32> us[i,j,k] = sum_float32(m, D[j,m]*u[i,m,k,e])", "[|i,j,k] <float32> ut[i,j,k] = sum_float32(m, D[k,m]*u[i,j,m,e])", "lap[i,j,k,e] = " - " sum_float32(m, D[m,i]*(G[0,m,j,k,e]*ur[m,j,k,e] + G[1,m,j,k,e]*us[m,j,k,e] + G[2,m,j,k,e]*ut[m,j,k,e]))" - "+ sum_float32(m, D[m,j]*(G[1,i,m,k,e]*ur[i,m,k,e] + G[3,i,m,k,e]*us[i,m,k,e] + G[4,i,m,k,e]*ut[i,m,k,e]))" - "+ sum_float32(m, D[m,k]*(G[2,i,j,m,e]*ur[i,j,m,e] + G[4,i,j,m,e]*us[i,j,m,e] + G[5,i,j,m,e]*ut[i,j,m,e]))" + " sum_float32(m, D[m,i]*(G[0,m,j,k,e]*ur[m,j,k] + G[1,m,j,k,e]*us[m,j,k] + G[2,m,j,k,e]*ut[m,j,k]))" + "+ sum_float32(m, D[m,j]*(G[1,i,m,k,e]*ur[i,m,k] + G[3,i,m,k,e]*us[i,m,k] + G[4,i,m,k,e]*ut[i,m,k]))" + "+ sum_float32(m, D[m,k]*(G[2,i,j,m,e]*ur[i,j,m] + G[4,i,j,m,e]*us[i,j,m] + G[5,i,j,m,e]*ut[i,j,m]))" ], [ lp.ArrayArg("u", dtype, shape=field_shape, order=order), @@ -307,17 +306,18 @@ def test_sem_3d(ctx_factory): ], name="semlap", assumptions="K>=1") - print knl + #print knl #for tv in knl.temporary_variables.iteritems(): #print tv - 1/0 + #1/0 knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) #knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp") knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k"]) - knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"]) - print knl + #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"]) + knl = lp.preprocess_kernel(knl) + #print knl #1/0 kernel_gen = lp.generate_loop_schedules(knl) @@ -343,10 +343,6 @@ def test_sem_3d(ctx_factory): if __name__ == "__main__": - # make sure that import failures get reported, instead of skipping the - # tests. - import pyopencl as cl - import sys if len(sys.argv) > 1: exec(sys.argv[1])