From c12c610978b2b1ecab1a6b619f64315b241bfa0e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <kaushikcfd@gmail.com> Date: Tue, 20 Nov 2018 12:45:04 -0600 Subject: [PATCH] Merge 'master' into 'new_function_interface' --- .gitlab-ci.yml | 19 ++++++++++- LICENSE | 21 ++++++++++++ .../make-linux-build-docker-inner-part-2.sh | 4 +++ loopy/frontend/fortran/tree.py | 2 +- loopy/kernel/tools.py | 4 +-- loopy/schedule/__init__.py | 10 ++++-- loopy/statistics.py | 20 ++++++++---- loopy/symbolic.py | 2 +- loopy/target/cuda.py | 2 +- loopy/target/pyopencl.py | 3 +- requirements.txt | 5 +-- setup.cfg | 2 +- test/test_loopy.py | 19 +++++++++++ test/test_numa_diff.py | 2 +- test/test_reduction.py | 32 +++++++++++-------- test/test_statistics.py | 14 +++++--- test/test_target.py | 17 ++++++++++ 17 files changed, 137 insertions(+), 41 deletions(-) create mode 100644 LICENSE diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1caef802b..ea69114d6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,6 +12,10 @@ Python 2.7 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 2.7 with legacy PyOpenCL: script: @@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL: except: - tags retry: 2 + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL: script: @@ -43,6 +51,10 @@ Python 3.6 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL Twice With Cache: script: @@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + # PyPy POCL: # script: @@ -77,7 +93,7 @@ Python 3.6 POCL Examples: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert" + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - ". ./build-py-project-and-run-examples.sh" tags: - python3.6 @@ -87,6 +103,7 @@ Python 3.6 POCL Examples: except: - tags + CentOS binary: script: - (cd build-helpers; ./make-linux-build-docker.sh --nodate) diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..601df74bd --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Andreas Klöckner and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
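Note on the CI change above: the `artifacts: reports: junit: test/pytest.xml` entries added to each job tell GitLab to collect a JUnit XML report from that path; they assume the job's test step actually writes the file there. A minimal sketch of an invocation that would produce such a report (the working directory and exact pytest flags used by the CI build scripts are an assumption, inferred only from the artifact path in this patch, not from the scripts themselves):

    import pytest

    # Run the test suite and emit JUnit XML where the GitLab CI
    # "artifacts: reports: junit" entry expects it. The output path is
    # assumed from the CI config above, not taken from the build scripts.
    pytest.main(["test", "--junitxml=test/pytest.xml"])
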
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh index 1e35a1e1b..035634b16 100755 --- a/build-helpers/make-linux-build-docker-inner-part-2.sh +++ b/build-helpers/make-linux-build-docker-inner-part-2.sh @@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy cd loopy grep -v pyopencl requirements.txt > myreq.txt + +# needed for pyinstaller package to be usable +echo packaging >> myreq.txt + pip install -r myreq.txt python setup.py install diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index b1df6e3d0..6939bb6ad 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -53,7 +53,7 @@ class FTreeWalkerBase(object): ENTITY_RE = re.compile( r"^(?P<name>[_0-9a-zA-Z]+)" - "(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$") + r"(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 006ac6ba3..3aaa8d56a 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1253,7 +1253,7 @@ def draw_dependencies_as_unicode_arrows( for dep in insn.depends_on: reverse_deps.setdefault(dep, set()).add(insn.id) - # mapping of (from_id, to_id) tuples to column_index + # mapping of to_id tuples to column_index dep_to_column = {} # {{{ find column assignments @@ -1330,7 +1330,7 @@ def draw_dependencies_as_unicode_arrows( elif insn.id in starts: starts.remove(insn.id) - if starts: + if starts or pointed_at_insn_id not in processed_ids: # will continue downward row[col] = do_flag_downward(u"├", pointed_at_insn_id) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 2b3f7a3b9..3dc1c0bbe 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -794,9 +794,13 @@ def generate_loop_schedules_internal( if not is_ready: if debug_mode: - print("instruction '%s' is missing insn depedencies '%s'" % ( - format_insn(kernel, insn.id), ",".join( - insn.depends_on - sched_state.scheduled_insn_ids))) + # These are not that interesting when understanding scheduler + # failures. 
+ + # print("instruction '%s' is missing insn depedencies '%s'" % ( + # format_insn(kernel, insn.id), ",".join( + # insn.depends_on - sched_state.scheduled_insn_ids))) + pass continue want = kernel.insn_inames(insn) - sched_state.parallel_inames diff --git a/loopy/statistics.py b/loopy/statistics.py index d65387d16..454cca18e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -707,9 +707,10 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, callables_table): + def __init__(self, knl, callables_table, count_within_subscripts=True): self.knl = knl self.callables_table = callables_table + self.count_within_subscripts = count_within_subscripts from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl, callables_table) @@ -737,7 +738,10 @@ class ExpressionOpCounter(CounterBase): ) + self.rec(expr.parameters) def map_subscript(self, expr): - return self.rec(expr.index) + if self.count_within_subscripts: + return self.rec(expr.index) + else: + return ToCountMap() def map_sum(self, expr): assert expr.children @@ -1343,10 +1347,9 @@ def _get_insn_count(knl, callables_table, insn_id, subgroup_size, # {{{ get_op_map - def get_op_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1394,7 +1397,7 @@ def get_op_map_for_single_kernel(knl, callables_table, def get_op_map(program, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): """Count the number of operations in a loopy kernel. @@ -1410,6 +1413,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) + :arg count_within_subscripts: A :class:`bool` specifying whether to + count operations inside array indices. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. 
An OpenCL sub-group is an implementation-dependent grouping of work-items within @@ -1464,8 +1470,8 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_op_map = get_op_map_for_single_kernel(knl, - program.callables_table, numpy_types, - count_redundant_work, subgroup_size) + program.callables_table, numpy_types, count_redundant_work, + count_within_subscripts, subgroup_size) for i in range(callables_count[func_id]): op_map += knl_op_map diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 92b209ac9..04cf2d02b 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1696,7 +1696,7 @@ def get_access_range(domain, subscript, assumptions, shape=None, if shape is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) - except ExpressionToAffineConversionError as sub_err: + except ExpressionToAffineConversionError: pass if shape_aff is None: diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 32b810eb3..6b4385bff 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -344,7 +344,7 @@ class CUDACASTBuilder(CASTBuilder): _VEC_AXES = "xyzw" def add_vector_access(self, access_expr, index): - return access_expr.a(self._VEC_AXES[index]) + return access_expr.attr(self._VEC_AXES[index]) def emit_barrier(self, synchronization_kind, mem_kind, comment): """ diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index d98b6cdd6..5ef564572 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device): new_storage_shape = storage_shape - new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape) + new_temp_vars[temp_var.name] = temp_var.copy( + storage_shape=tuple(new_storage_shape)) return kernel.copy(temporary_variables=new_temp_vars) diff --git a/requirements.txt b/requirements.txt index a3e88cfea..97c202476 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git git+https://github.com/inducer/f2py # Optional, needed for using the C preprocessor on Fortran -ply>=3.6 - -# This is needed for the pyinstaller executable to be usable. -packaging +ply>=3.6 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index b939ce0cf..eec3dfd1f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504 max-line-length=85 exclude= loopy/target/c/compyte/ndarray, diff --git a/test/test_loopy.py b/test/test_loopy.py index fa32ca04c..b770497f1 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2890,6 +2890,25 @@ def test_dep_cycle_printing_and_error(): print(lp.generate_code(knl).device_code()) +def test_backwards_dep_printing_and_error(): + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + c[i] = a[i] + b[i] {id=insn1} + c[i] = 2*c[i] {id=insn2, dep=insn1} + c[i] = 7*c[i] + a[i]*a[i] + b[i]*b[i] {id=insn3, dep=insn2} + b[i] = b[i] + c[i] {id=insn4, dep=insn3} + d[i] = 7*a[i ] {id=insn5, dep=insn4} + a[i] = a[i] + d[i] {id=insn6, dep=insn5} + """, [ + lp.GlobalArg('a, b', dtype=np.float64), + "..." 
+ ]) + + # Used to crash with KeyError + print(knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 62f490cee..1ba44e77e 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -47,8 +47,8 @@ __all__ = [ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("ilp_multiple", [1, 2]) +@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("opt_level", [11]) def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa ctx = ctx_factory() diff --git a/test/test_reduction.py b/test/test_reduction.py index 96dab405a..aaf11ee29 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -219,32 +219,38 @@ def test_local_parallel_reduction(ctx_factory, size): def test_global_parallel_reduction(ctx_factory, size): ctx = ctx_factory() - prog = lp.make_kernel( + knl = lp.make_kernel( "{[i]: 0 <= i < n }", """ # Using z[0] instead of z works around a bug in ancient PyOpenCL. - z[0] = sum(i, i/13) + z[0] = sum(i, a[i]) """) - ref_prog = prog + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + ref_knl = knl gsize = 128 - prog = lp.split_iname(prog, "i", gsize * 20) - prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0") - prog = lp.split_reduction_inward(prog, "i_inner_inner") - prog = lp.split_reduction_inward(prog, "i_inner_outer") + knl = lp.split_iname(knl, "i", gsize * 20) + knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0") + knl = lp.split_reduction_outward(knl, "i_outer") + knl = lp.split_reduction_inward(knl, "i_inner_outer") from loopy.transform.data import reduction_arg_to_subst_rule - prog = reduction_arg_to_subst_rule(prog, "i_outer") - prog = lp.precompute(prog, "red_i_outer_arg", "i_outer", + knl = reduction_arg_to_subst_rule(knl, "i_outer") + + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL, default_tag="l.auto") - prog = lp.realize_reduction(prog) - prog = lp.add_dependency( - prog, "writes:acc_i_outer", + knl = lp.realize_reduction(knl) + knl = lp.tag_inames(knl, "i_outer_0:g.0") + + # Keep the i_outer accumulator on the correct (lower) side of the barrier, + # otherwise there will be useless save/reload code generated. 
+ knl = lp.add_dependency( + knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") lp.auto_test_vs_ref( - ref_prog, ctx, prog, parameters={"n": size}, + ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True) diff --git a/test/test_statistics.py b/test/test_statistics.py index 3f2366521..41b44b5a7 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -57,7 +57,8 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -161,7 +162,8 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -206,7 +208,8 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=False) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -226,7 +229,7 @@ def test_op_counter_bitwise(): i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups - assert i32add == n*m+n*m*ell*n_subgroups + assert i32add == n*m*ell*n_subgroups assert i32bw == 2*n*m*ell*n_subgroups assert i64bw == 2*n*m*n_subgroups assert i64add == i64mul == n*m*n_subgroups @@ -1153,7 +1156,8 @@ def test_summations_and_filters(): assert f32lall == (3*n*m*ell)*n_subgroups assert f64lall == (2*n*m)*n_subgroups - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) diff --git a/test/test_target.py b/test/test_target.py index a5186c71c..095bf0939 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -350,6 +350,23 @@ def test_ispc_streaming_stores(): lp.generate_code_v2(knl).all_code() +def test_cuda_short_vector(): + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]", + target=lp.CudaTarget()) + + knl = lp.set_options(knl, write_code=True) + knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec") + knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4) + knl = lp.tag_array_axes(knl, "a,out", "C,vec") + + knl = lp.set_options(knl, write_wrapper=True) + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + + print(lp.generate_code_v2(knl).device_code()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab
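
Note on the statistics change in this patch: the new `count_within_subscripts` flag is threaded from `get_op_map` through `get_op_map_for_single_kernel` into `ExpressionOpCounter`. When the flag is False, `map_subscript` returns an empty `ToCountMap` instead of recursing into the index expression, so arithmetic inside array indices is excluded from the operation counts (which is why the expected `i32add` count in `test_op_counter_bitwise` loses its `n*m` term once the test passes `count_within_subscripts=False`). A short usage sketch in the spirit of the updated tests; the kernel body, dtype, and subgroup size here are illustrative, not taken from the patch:

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            # the "+ 1" inside the subscript is an integer add that only
            # occurs in index arithmetic
            "out[i] = 2*a[i + 1]")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    # Count index arithmetic along with the floating-point multiply...
    op_map_all = lp.get_op_map(knl, subgroup_size=32,
            count_redundant_work=True, count_within_subscripts=True)

    # ...or leave out operations that appear only inside subscripts.
    op_map_no_idx = lp.get_op_map(knl, subgroup_size=32,
            count_redundant_work=True, count_within_subscripts=False)

    print(op_map_all)
    print(op_map_no_idx)
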