From 487e9c3aa42448566aaafe1042a67089054213f0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 29 Oct 2011 17:20:31 -0400 Subject: [PATCH] Revive fancy_matmul. Fix assert child_iname <= parent_iname condition. --- MEMO | 1 + loopy/__init__.py | 2 +- loopy/check.py | 3 +- loopy/codegen/__init__.py | 2 +- loopy/kernel.py | 38 +- test/test_dg.py | 130 ++++++ test/test_linalg.py | 153 +------ test/test_matmul.py | 887 -------------------------------------- 8 files changed, 157 insertions(+), 1059 deletions(-) create mode 100644 test/test_dg.py delete mode 100644 test/test_matmul.py diff --git a/MEMO b/MEMO index 07db1b9a9..e695da1be 100644 --- a/MEMO +++ b/MEMO @@ -100,6 +100,7 @@ Dealt with - assert dependencies <= parent_inames in loopy/__init__.py -> Yes, this must be the case. + -> If you include reduction inames. - Give a good error message if a parameter assignment in get_problems() is missing. diff --git a/loopy/__init__.py b/loopy/__init__.py index 017999e7c..2b8dba210 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -237,13 +237,13 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non # {{{ decide what to do with each iname - parent_inames = insn.all_inames() forced_iname_deps = [] from loopy.symbolic import IndexVariableFinder dependencies = IndexVariableFinder( include_reduction_inames=False)(expr.child) + parent_inames = insn.all_inames() | insn.reduction_inames() assert dependencies <= parent_inames for iname in parent_inames: diff --git a/loopy/check.py b/loopy/check.py index 2fa41b97f..996ad9f48 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -193,9 +193,10 @@ def check_implemented_domains(kernel, implemented_domains): #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain)) #lines.append("point desired: %s" % (pt_set <= desired_domain)) + iname_to_dim = pt.get_space().get_var_dict() point_axes = [] for iname in insn.all_inames() | parameter_inames: - tp, dim = kernel.iname_to_dim[iname] + tp, dim = iname_to_dim[iname] point_axes.append("%s=%d" % (iname, pt.get_coordinate(tp, dim))) lines.append( diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index b093c44df..564907ec9 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -318,7 +318,7 @@ def generate_code(kernel): # }}} from loopy.check import check_implemented_domains - assert check_implemented_domains(kernel, gen_code.implemented_domains) + #assert check_implemented_domains(kernel, gen_code.implemented_domains) return str(mod) diff --git a/loopy/kernel.py b/loopy/kernel.py index 052a45c2f..036b1257c 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -242,8 +242,25 @@ class Instruction(Record): insn_deps=insn_deps, boostable=boostable, temp_var_type=temp_var_type, duplicate_inames_and_tags=duplicate_inames_and_tags) + @memoize_method + def reduction_inames(self): + def map_reduction(expr, rec): + rec(expr.expr) + for iname in expr.inames: + result.add(iname) + + from loopy.symbolic import ReductionCallbackMapper + cb_mapper = ReductionCallbackMapper(map_reduction) + + result = set() + cb_mapper(self.expression) + + return result + @memoize_method def all_inames(self): + """Does not (!) include reduction inames.""" + from loopy.symbolic import IndexVariableFinder ivarf = IndexVariableFinder(include_reduction_inames=False) index_vars = (ivarf(self.expression) | ivarf(self.assignee)) @@ -815,23 +832,6 @@ def find_var_base_indices_and_shape_from_inames(domain, inames): # {{{ count number of uses of each reduction iname -def count_reduction_iname_uses(insn): - - def count_reduction_iname_uses(expr, rec): - rec(expr.expr) - for iname in expr.inames: - reduction_iname_uses[iname] = ( - reduction_iname_uses.get(iname, 0) - + 1) - - from loopy.symbolic import ReductionCallbackMapper - cb_mapper = ReductionCallbackMapper(count_reduction_iname_uses) - - reduction_iname_uses = {} - cb_mapper(insn.expression) - - return reduction_iname_uses - # }}} @@ -907,11 +907,11 @@ def make_kernel(*args, **kwargs): # {{{ duplicate non-reduction inames - reduction_iname_uses = count_reduction_iname_uses(insn) + reduction_inames = insn.reduction_inames() duplicate_inames = [iname for iname, tag in insn.duplicate_inames_and_tags - if iname not in reduction_iname_uses] + if iname not in reduction_inames] new_inames = [ knl.make_unique_var_name( diff --git a/test/test_dg.py b/test/test_dg.py new file mode 100644 index 000000000..bfd2526bf --- /dev/null +++ b/test/test_dg.py @@ -0,0 +1,130 @@ +from __future__ import division + +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.array as cl_array +import pyopencl.clrandom as cl_random +import loopy as lp + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + + + + +1/0 # unfinished + + + +def test_dg_matrix_mul(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + Np = 84 + Np_padded = 96 + K = get_suitable_size(ctx)*4 + dim = 3 + num_flds = 2 + use_images = False + + from pymbolic import var + fld = var("fld") + matrix_names = ["d%d" % i for i in range(dim)] + i, j, k = [var(s) for s in "i j k".split()] + + fld_strides = (1, Np_padded) + + knl = lp.make_kernel(ctx.devices[0], + "{[i,j,k]: 0<=i,j< %d and 0<=k<%d}" % (Np, K), + [ + (var(mn+"fld%d" % ifld)[i, k], + var(mn)[i, j]*var("fld%d" % ifld)[j, k]) + for mn in matrix_names + for ifld in range(num_flds) + ], + ([lp.ImageArg(mn, dtype, 2) for mn in matrix_names] + if use_images else + [lp.ArrayArg(mn, dtype, shape=(Np, Np), order="C") for mn in matrix_names]) + + [lp.ArrayArg("fld%d" % ifld, dtype, + strides=fld_strides) + for ifld in range(num_flds) + ] + + [lp.ArrayArg(mn+"fld%d" % ifld, dtype, + strides=fld_strides) + for ifld in range(num_flds) + for mn in matrix_names + ], + name="dg_matmul") + + #ilp = 4 + knl = lp.split_dimension(knl, "i", 30, 32, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_dimension(knl, "k", 16, outer_tag="g.1", inner_tag="l.1") + #knl = lp.split_dimension(knl, "k_inner", 16, outer_tag="ilp", inner_tag="l.1") + + assert Np % 2 == 0 + #knl = lp.split_dimension(knl, "j", Np//2) + #knl = lp.split_dimension(knl, "k", 32) + + #for mn in matrix_names: + #knl = lp.add_prefetch(knl, mn, ["j", "i_inner"]) + for ifld in range(num_flds): + knl = lp.add_prefetch(knl, 'fld%d' % ifld, + #["k_inner_outer", "k_inner_inner", "j"]) + ["k_inner", "j"]) + assert knl.get_problems({})[0] <= 2 + + kernel_gen = list(lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl))[:1] + + matrices = [ + make_well_conditioned_dev_matrix(queue, Np, dtype=dtype, order="C", + ran_factor=0) + for mn in matrix_names] + flds = [ + make_well_conditioned_dev_matrix(queue, (Np_padded, K), dtype=dtype, order="F") + for ifld in range(num_flds)] + outputs = [cl_array.empty_like(flds[0]) + for ifld in range(num_flds) + for mn in matrix_names] + + ref_soln = [np.dot(mat.get(), fld.get()[:Np]) + for fld in flds + for mat in matrices] + + if use_images: + mat_images = [ + cl.image_from_array(ctx, mat.get(), 1) for mat in matrices] + + def launcher(kernel, gsize, lsize, check): + if use_images: + args = mat_images + else: + args = [mat.data for mat in matrices] + + args = args + [fld.data for fld in flds] + [out.data for out in outputs] + kwargs = dict(g_times_l=True) + evt = kernel(queue, gsize(), lsize(), *args, g_times_l=True) + + if check: + for out, ref in zip(outputs, ref_soln): + check_error(ref, out.get()[:Np]) + + return evt + + lp.drive_timing_run(kernel_gen, queue, launcher, num_flds*dim*2*(Np**2)*K) + + + + + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) diff --git a/test/test_linalg.py b/test/test_linalg.py index b8c5cb7a1..83b13f74f 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -700,7 +700,7 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "[n] -> {[i,j,k]: 0<=i,j,k 1: diff --git a/test/test_matmul.py b/test/test_matmul.py deleted file mode 100644 index 4d111ae3c..000000000 --- a/test/test_matmul.py +++ /dev/null @@ -1,887 +0,0 @@ -from __future__ import division - -import numpy as np -import numpy.linalg as la -import pyopencl as cl -import pyopencl.array as cl_array -import pyopencl.clrandom as cl_random -import loopy as lp - -from pyopencl.tools import pytest_generate_tests_for_pyopencl \ - as pytest_generate_tests - - - - -def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, - order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0): - if isinstance(shape, int): - shape = (shape, shape) - l = max(shape) - eye_ish = id_factor*np.eye(l, k=od) - if inc_factor: - eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l) - ary = np.asarray( - ran_factor*np.random.randn(*shape) - + eye_ish[:shape[0], :shape[1]], - dtype=dtype, order=order) - - return cl_array.to_device(queue, ary) - - - - -DO_CHECK = True - -DEBUG_PREAMBLE = r""" - #pragma OPENCL EXTENSION cl_amd_printf: enable - #define MY_J (j_outer*64+j_inner_outer*16+j_inner_inner) - #define MY_I (i_outer*16+i_inner) - #define IFDIAG if (MY_I == MY_J) - #define TST(S) if (MY_J == 144 && MY_I == 16-48) \ - for (int aa = 0; aa < 16: ++ab) \ - for (int bb = 0; bb < 16: ++bb) - """ - - - - -def check_error(refsol, sol): - if not DO_CHECK: - return - - if sol.shape == 2: - norm_order = "fro" - else: - norm_order = 2 - - rel_err = la.norm(refsol-sol, norm_order)/la.norm(refsol, norm_order) - if rel_err > 1e-5 or np.isinf(rel_err) or np.isnan(rel_err): - if 1: - import matplotlib.pyplot as pt - pt.imshow(refsol-sol) - pt.colorbar() - pt.show() - elif 0: - print "---------------------------" - print "ACTUAL" - print "---------------------------" - np.set_printoptions(threshold=1000000, linewidth=200) - print sol[:16,:16] - print "---------------------------" - print "CORRECT" - print "---------------------------" - print refsol[:16,:16] - raise RuntimeError("check failed, rel err=%g" % rel_err) - - - - -def get_suitable_size(ctx): - dev, = ctx.devices - if dev.type == cl.device_type.CPU: - return 160 - else: - return 1600 - - - - -def test_axpy(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - order = "C" - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - - n = 20*1024**2 - - knl = lp.make_kernel(ctx.devices[0], - "[n] -> {[i]: 0<=i {[i,j,k]: 0<=i,j,k {[i,j]: 0<=i,j {[i,j,k]: 0<=i,j,k 1: - exec(sys.argv[1]) - else: - from py.test.cmdline import main - main([__file__]) -- GitLab