Patch overview (free text ahead of the first "diff --git" header; git apply
and patch(1) skip leading text like this).

  examples/dg.py          line 1 (blank) is replaced by a FIXME note saying the
                          example is not yet updated for new-style loopy.
  examples/quadrature.py  new 107-line example; builds a loopy kernel named
                          "mass_mat" and hands it to lp.drive_timing_run.
  loopy/__init__.py       three TODO comment lines added.
  loopy/kernel.py         the "group axis %d too big" message now gets "% i",
                          so the axis index is formatted into it.
  test/test_matmul.py     "1/0" becomes "pass"; the edit_code=True argument is
                          dropped from the lp.drive_timing_run call.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Line breaks were rebuilt from the +/-/context
markers and the hunk line counts (all five hunks balance). Indentation, and
the exact position of whitespace-only context lines, are reconstructed and
must be confirmed against the repository before applying.

NOTE(review): in examples/quadrature.py, launcher() still starts with "1/0",
and the "if False:" block uses n and order, which are not defined anywhere in
the file as shown. Carried over unchanged; presumably placeholders.

diff --git a/examples/dg.py b/examples/dg.py
index 4e034c4f90daf1ecbf52c01ae18848153945eba6..85563ca1ff27d4bd875485f67ac99f1c67a72d8a 100644
--- a/examples/dg.py
+++ b/examples/dg.py
@@ -1,4 +1,4 @@
-
+# FIXME NOT UPDATED YET FOR NEW-STYLE LOOPY!
 
 
 
diff --git a/examples/quadrature.py b/examples/quadrature.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bf4e6f21f6b1ecd5b234dd31a0864dba372f09a
--- /dev/null
+++ b/examples/quadrature.py
@@ -0,0 +1,107 @@
+
+import numpy as np
+import pyopencl as cl
+import pyopencl.array as cl_array
+import loopy as lp
+
+
+
+
+def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32,
+        order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0):
+    if isinstance(shape, int):
+        shape = (shape, shape)
+    l = max(shape)
+    eye_ish = id_factor*np.eye(l, k=od)
+    if inc_factor:
+        eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l)
+    ary = np.asarray(
+        ran_factor*np.random.randn(*shape)
+        + eye_ish[:shape[0], :shape[1]],
+        dtype=dtype, order=order)
+
+    return cl_array.to_device(queue, ary)
+
+
+
+
+def build_mass_mat_maker(ctx_factory=cl.create_some_context):
+    dtype = np.float32
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    Nb = 3
+    Nv = 3
+    Nq = 3*3
+
+    Nc = 1600
+    from pymbolic import var
+    m, w, det_j, phi, c, i, j, q = [var(s) for s in "m w det_j phi c i j q".split()]
+
+    knl = lp.LoopKernel(ctx.devices[0],
+            "[ncells] -> {[c,i,j,q]: 0<=c<ncells and 0 <= i < %(Nv)s "
+            "and 0<=j<%(Nb)s and 0<=q<%(Nq)s}" % dict(
+                Nv=Nv, Nb=Nb, Nq=Nq),
+            [
+                (m[c, i, j], w[q]*det_j[c]*phi[i,q]*phi[j,q])
+                ],
+            [
+                lp.ArrayArg("m", dtype, shape=(Nc, Nv, Nb)),
+                lp.ArrayArg("w", dtype, shape=(Nq,)),
+                lp.ArrayArg("det_j", dtype, shape=(Nc,)),
+                lp.ArrayArg("phi", dtype, shape=(Nv, Nq,)),
+                lp.ScalarArg("ncells", np.int32, approximately=1000),
+                ],
+            name="mass_mat",
+            iname_to_tag=dict(i="l.0", j="l.1")
+            )
+    knl = lp.split_dimension(knl, "c", 8, outer_tag="g.0", inner_tag="l.2")
+    knl = lp.add_prefetch(knl, "det_j", ["c_inner"])
+
+    # fix reg prefetch
+    # fix redundant slab generation
+
+    # FIXME
+    #knl = lp.split_dimension(knl, "c", 8, inner_tag="l.2")
+    #knl = lp.split_dimension(knl, "c_outer", 8, outer_tag="g.0")
+
+    #ilp = 4
+    #knl = lp.split_dimension(knl, "i", 2, outer_tag="g.0", inner_tag="l.1")
+    #j_inner_split = 16
+    #knl = lp.split_dimension(knl, "j", ilp*j_inner_split, outer_tag="g.1")
+    #knl = lp.split_dimension(knl, "j_inner", j_inner_split, outer_tag="ilp", inner_tag="l.0")
+    #knl = lp.split_dimension(knl, "k", 2)
+
+    #knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
+    #knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"])
+    #assert knl.get_problems({})[0] <= 2
+
+    kernel_gen = (lp.insert_register_prefetches(knl)
+            for knl in lp.generate_loop_schedules(knl))
+
+    if False:
+        a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
+                ran_factor=1, id_factor=5)
+        b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order,
+                ran_factor=1, id_factor=5, inc_factor=0)
+        c = cl_array.empty_like(a)
+        a_img = cl.image_from_array(ctx, a.get(), 1)
+        b_img = cl.image_from_array(ctx, b.get(), 1)
+
+    def launcher(kernel, gsize, lsize, check):
+        1/0
+        evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data,
+                g_times_l=True)
+
+        return evt
+
+    from pyopencl.characterize import get_fast_inaccurate_build_options
+    lp.drive_timing_run(kernel_gen, queue, launcher, flop_count=0,
+            options=get_fast_inaccurate_build_options(ctx.devices[0]))
+
+
+
+
+if __name__ == "__main__":
+    build_mass_mat_maker()
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 390f9996e36e4b4195eb167e30e430512c6fc623..147efabf99ab20e57ee88c19cfc1bc06c9f96653 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -31,6 +31,9 @@ register_mpz_with_pymbolic()
 
 # TODO: Implement GT200 matmul, Fermi matmul, DG
 # TODO: DMA engine threads?
+# TODO: Specify initial implemented domain.
+# (to filter away unnecessary conditions on parameters)
+# TODO: Deal with equalities that crop up.
 
 # Later:
 # ------
diff --git a/loopy/kernel.py b/loopy/kernel.py
index e5be621b01e13fb77389787bed1fc6788bbe3b2d..6eb67fde0b1dd257ee2b98f94fbee476e4d090d4 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -488,7 +488,7 @@ class LoopKernel(Record):
 
         for i in range(len(llens)):
             if llens[i] > self.device.max_work_item_sizes[i]:
-                msg(5, "group axis %d too big")
+                msg(5, "group axis %d too big" % i)
 
         from pytools import product
         if product(llens) > self.device.max_work_group_size:
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 1c6210f0d9c5c5e10636ae718ae04707c474693b..7aa72e646ac2aa800e390ee32245e103b25d24e4 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -127,10 +127,9 @@ def test_axpy(ctx_factory):
 
         #check_error(refsol, c.get())
         #return evt
-        1/0
+        pass
 
-    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3,
-            edit_code=True)
+    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)
 
 
 