diff --git a/test/gnuma_loopy_transforms.py b/test/gnuma_loopy_transforms.py index f741072c9add533cf6501faf087f5cf4a8b20b52..2107d50b03bdf99dad1b8c5bd7f31dfa7062579c 100644 --- a/test/gnuma_loopy_transforms.py +++ b/test/gnuma_loopy_transforms.py @@ -32,10 +32,8 @@ def set_q_storage_format(kernel, name): return kernel -def prefetch_and_set_D_storage_format(kernel): - kernel = lp.tag_data_axes(kernel, "D", "f,f") - kernel = lp.add_prefetch(kernel, "D[:,:]") - return kernel +def set_D_storage_format(kernel): + return lp.tag_data_axes(kernel, "D", "f,f") def set_up_volume_loop(kernel, Nq): diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 3f162af5bbc66fb308440d29f1769b3caff5713d..4350a3878469cb7ccd16ba5f0dda2d287d7f2136 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -2,7 +2,7 @@ from __future__ import division -__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2015 Andreas Kloeckner, Lucas Wilcox" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -26,15 +26,27 @@ THE SOFTWARE. import pytest import loopy as lp -import pyopencl as cl # noqa +import pyopencl as cl import sys pytestmark = pytest.mark.importorskip("fparser") +import logging +logger = logging.getLogger(__name__) + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + @pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("ilp_multiple", [1, 2]) -def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq): +@pytest.mark.parametrize("opt_level", [11]) +def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): ctx = ctx_factory() filename = "strongVolumeKernels.f90" @@ -54,8 +66,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq): from gnuma_loopy_transforms import ( fix_euler_parameters, - set_q_storage_format, - prefetch_and_set_D_storage_format) + set_q_storage_format, set_D_storage_format) hsv = lp.fix_parameters(hsv, Nq=Nq) hsv = lp.set_loop_priority(hsv, "e,k,j,i") @@ -66,11 +77,19 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq): for name in ["Q", "rhsQ"]: hsv = set_q_storage_format(hsv, name) - hsv = prefetch_and_set_D_storage_format(hsv) + hsv = set_D_storage_format(hsv) #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors") ref_hsv = hsv + if opt_level == 0: + tap_hsv = hsv + + hsv = lp.add_prefetch(hsv, "D[:,:]") + + if opt_level == 1: + tap_hsv = hsv + # turn the first reads into subst rules local_prep_var_names = set() for insn in lp.find_instructions(hsv, "tag:local_prep"): @@ -85,7 +104,6 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq): r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl") s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl") - ilp_multiple = 1 if ilp_multiple > 1: hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp") ilp_inames = ("k_inner",) @@ -139,17 +157,38 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq): hsv = lp.alias_temporaries(hsv, rtmps) hsv = lp.alias_temporaries(hsv, stmps) + if opt_level == 2: + tap_hsv = hsv + for prep_var_name in local_prep_var_names: if prep_var_name.startswith("Jinv") or "_s" in prep_var_name: continue hsv = lp.precompute(hsv, lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*")) + if opt_level == 3: + tap_hsv = hsv + hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames) + + if opt_level == 4: + tap_hsv = hsv + tap_hsv = lp.tag_inames(tap_hsv, dict( + Q_dim_field_inner="unr", + Q_dim_field_outer="unr")) + hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames, fetch_bounding_box=True, default_tag="for", init_expression="0") + if opt_level == 5: + tap_hsv = hsv + tap_hsv = lp.tag_inames(tap_hsv, dict( + rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr", + rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr", + Q_dim_field_inner="unr", + Q_dim_field_outer="unr")) + # buffer axes need to be vectorized in order for this to work hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c") hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c") @@ -157,15 +196,33 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq): hsv = lp.tag_inames(hsv, {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"}, ignore_nonexistent=True) + + if opt_level == 6: + tap_hsv = hsv + tap_hsv = lp.tag_inames(tap_hsv, dict( + rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr", + rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr", + Q_dim_field_inner="unr", + Q_dim_field_outer="unr")) + hsv = lp.tag_inames(hsv, dict( rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec", rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr", Q_dim_field_inner="vec", Q_dim_field_outer="unr")) + + if opt_level == 7: + tap_hsv = hsv + hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf", vary_by_axes=(0,) if ilp_multiple > 1 else ()) - if 0: + if opt_level >= 8: + tap_hsv = hsv + + hsv = tap_hsv + + if 1: print("OPS") op_poly = lp.get_op_poly(hsv) print(lp.stringify_stats_mapping(op_poly)) @@ -174,19 +231,22 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq): gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv)) print(lp.stringify_stats_mapping(gmem_poly)) - # FIXME - if 0: - hsv = lp.set_options(hsv, cl_build_options=[ - "-cl-no-signed-zeros", - "-cl-fast-relaxed-math", - "-cl-mad-enable", - "-cl-uniform-work-group-size", - ]) + hsv = lp.set_options(hsv, cl_build_options=[ + "-cl-denorms-are-zero", + "-cl-fast-relaxed-math", + "-cl-finite-math-only", + "-cl-mad-enable", + "-cl-no-signed-zeros", + ]) hsv = hsv.copy(name="horizontalStrongVolumeKernel") - lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300), - do_check=False) + results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300), + do_check=False, quiet=True) + + elapsed = results["elapsed_wall"] + + print("elapsed", elapsed) if __name__ == "__main__":