diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..d0481192817877edea3b8deaaaf86b480fab2a11 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,63 @@ +name: CI +on: + push: + branches: + - master + pull_request: + paths-ignore: + - 'doc/*.rst' + schedule: + - cron: '17 3 * * 0' + +jobs: + flake8: + name: Flake8 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: "Main Script" + run: | + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + . ./prepare-and-run-flake8.sh ./loopy ./test + + pylint: + name: Pylint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + run: | + sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml + CONDA_ENVIRONMENT=.test-conda-env.yml + USE_CONDA_BUILD=1 + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh + . ./prepare-and-run-pylint.sh loopy test/test_*.py + + pytest3: + name: Conda Pytest Py3 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + run: | + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh + . ./build-and-test-py-project-within-miniconda.sh + + pytest_twice: + name: Pytest twice (for cache behavior) on Py3 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + run: | + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh + . 
./build-and-test-py-project-within-miniconda.sh + ${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxs $TESTABLES + +# vim: sw=4 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c11e507ee79cdc6f1567acbf6c12bbd7ed22f1cc..48bee8638df08ebe8c03a17f84c78851ff36466e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,26 +1,7 @@ -Python 2.7 POCL: - script: - - export PY_EXE=python2.7 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako" - - export LOOPY_NO_CACHE=1 - - export NO_DOCTESTS=1 - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - tags: - - python2.7 - - pocl - except: - - tags - artifacts: - reports: - junit: test/pytest.xml - - Python 3 POCL: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" - export LOOPY_NO_CACHE=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh @@ -57,7 +38,7 @@ Python 3 Intel: Python 3 POCL Twice With Cache: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" @@ -77,7 +58,7 @@ Python 3 POCL Twice With Cache: # PyPy POCL: # script: # - export PY_EXE=pypy -# - export PYOPENCL_TEST=portable +# - export PYOPENCL_TEST=portable:pthread # - export EXTRA_INSTALL="pybind11 numpy mako" # - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh # - ". 
./build-and-test-py-project.sh" @@ -90,7 +71,7 @@ Python 3 POCL Twice With Cache: Python 3 POCL Examples: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - ". ./build-py-project-and-run-examples.sh" tags: @@ -114,20 +95,6 @@ Pylint: except: - tags -CentOS binary: - script: - - (cd build-helpers; ./make-linux-build-docker.sh --nodate) - - (cd ./build-helpers; ./loopy-centos6 ../examples/fortran/sparse.floopy) - artifacts: - expire_in: 4 weeks - paths: - - build-helpers/loopy-centos6 - tags: - - docker - only: - - master - retry: 2 - Documentation: script: - EXTRA_INSTALL="pybind11 numpy" diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index a1fe086b4ac4562aaa8fafd32657aebbd1068e8a..ccbbc933aae2d3c0a28d7d30f178661950c76542 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -1,12 +1,12 @@ name: test-conda-env channels: - conda-forge -- defaults +- nodefaults dependencies: - python=3 - git -- conda-forge::numpy +- numpy - pocl - mako - pyopencl diff --git a/README.rst b/README.rst index fe7eb751a7144d9758df91914b643392de421450..3240983638e1f6f96ba7fec410c5c893db19c044 100644 --- a/README.rst +++ b/README.rst @@ -4,9 +4,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code .. image:: https://gitlab.tiker.net/inducer/loopy/badges/master/pipeline.svg :alt: Gitlab Build Status :target: https://gitlab.tiker.net/inducer/loopy/commits/master -.. image:: https://dev.azure.com/ak-spam/inducer/_apis/build/status/inducer.loopy?branchName=master - :alt: Azure Build Status - :target: https://dev.azure.com/ak-spam/inducer/_build/latest?definitionId=10&branchName=master +.. 
image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=master&event=push + :alt: Github Build Status + :target: https://github.com/inducer/loopy/actions?query=branch%3Amaster+workflow%3ACI+event%3Apush .. image:: https://badge.fury.io/py/loo.py.png :alt: Python Package Index Release Page :target: https://pypi.org/project/loo.py/ diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 0dfb2455568b275b40e699683071da3a1cd2f483..0000000000000000000000000000000000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,114 +0,0 @@ -jobs: -- - job: 'Python2' - pool: - vmImage: 'ubuntu-latest' - - steps: - - - script: | - set -e - sed 's/python=3/python=2.7/' .test-conda-env-py3.yml > .test-conda-env-py2.yml - cat .test-conda-env-py2.yml - CONDA_ENVIRONMENT=.test-conda-env-py2.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . ./build-and-test-py-project-within-miniconda.sh - - displayName: 'Pytest Conda' - - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'JUnit' - testResultsFiles: 'test/pytest.xml' - -- - job: 'Python3' - pool: - vmImage: 'ubuntu-latest' - - steps: - - - script: | - set -e - CONDA_ENVIRONMENT=.test-conda-env-py3.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . ./build-and-test-py-project-within-miniconda.sh - - displayName: 'Pytest Conda' - - - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'JUnit' - testResultsFiles: 'test/pytest.xml' - -- - job: 'Python3Twice' - displayName: "Python3 - run tests twice to test cache behavior" - pool: - vmImage: 'ubuntu-latest' - - steps: - - - script: | - set -e - CONDA_ENVIRONMENT=.test-conda-env-py3.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . 
./build-and-test-py-project-within-miniconda.sh - ${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxs $TESTABLES - - displayName: 'Pytest Conda' - - - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'JUnit' - testResultsFiles: 'test/pytest.xml' - -- - job: 'Flake8' - pool: - vmImage: 'ubuntu-latest' - strategy: - matrix: - Python37: - python.version: '3.7' - - steps: - - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - - - - script: | - set -e - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh loopy test - - displayName: 'Flake8' - -- - job: 'Pylint' - pool: - vmImage: 'ubuntu-latest' - - steps: - - - script: | - set -e - sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml - CONDA_ENVIRONMENT=.test-conda-env.yml - USE_CONDA_BUILD=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - . ./prepare-and-run-pylint.sh loopy test/test_*.py - - displayName: 'Pylint' - -schedules: -- - cron: "0 0 * * 0" - displayName: Weekly build - branches: - include: - - master diff --git a/build-helpers/.gitignore b/build-helpers/.gitignore deleted file mode 100644 index fef83014eecb14936006b90afc65595dd7d30b77..0000000000000000000000000000000000000000 --- a/build-helpers/.gitignore +++ /dev/null @@ -1 +0,0 @@ -loopy-*-20[0-9][0-9]* diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec deleted file mode 100644 index 08c0b6efe0efd3ad419b6565fd396c2f805eeab7..0000000000000000000000000000000000000000 --- a/build-helpers/loopy.spec +++ /dev/null @@ -1,70 +0,0 @@ -# -*- mode: python -*- - -from os.path import basename, dirname, join -from glob import glob - -single_file = True - -# This makes the executable spew debug info. 
-debug = False - -from os.path import expanduser - -import packaging # pip install packaging to add - -a = Analysis(['../bin/loopy'], - pathex=[expanduser('~/src/loopy')], - hiddenimports=[ - "decorator", - "appdirs", - "packaging.markers", - "packaging.specifiers", - "packaging.version", - "packaging.requirements", - ], - hookspath=None, - runtime_hooks=None, - excludes=["hedge", "meshpy", "pyopencl", "PIL"] - ) - -import ply.lex -import ply.yacc - - -a.datas += [ - (join("py-src", "ply", "lex", basename(fn)), fn, "DATA") - for fn in glob(join(dirname(ply.lex.__file__), "*.py")) - ] + [ - (join("py-src", "ply", "yacc", basename(fn)), fn, "DATA") - for fn in glob(join(dirname(ply.yacc.__file__), "*.py")) - ] - -pyz = PYZ(a.pure) - -if single_file: - exe = EXE(pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='loopy', - debug=debug, - strip=None, - upx=True, - console=True) -else: - exe = EXE(pyz, - a.scripts, - exclude_binaries=True, - name='loopy', - debug=debug, - strip=None, - upx=True, - console=True) - coll = COLLECT(exe, - a.binaries, - a.zipfiles, - a.datas, - strip=None, - upx=True, - name='loopy') diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh deleted file mode 100755 index 035634b16072e0188270abd8736dab99ce31dada..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker-inner-part-2.sh +++ /dev/null @@ -1,35 +0,0 @@ -#! 
/bin/bash - -set -e -set -x - -VENV_VERSION="virtualenv-15.2.0" -rm -Rf "$VENV_VERSION" -curl -k https://files.pythonhosted.org/packages/b1/72/2d70c5a1de409ceb3a27ff2ec007ecdd5cc52239e7c74990e32af57affe9/$VENV_VERSION.tar.gz | tar xfz - - -$VENV_VERSION/virtualenv.py --system-site-packages --no-setuptools .env - -source .env/bin/activate - -curl -k https://bootstrap.pypa.io/ez_setup.py | python - -curl -k https://gitlab.tiker.net/inducer/pip/raw/7.0.3/contrib/get-pip.py | python - - -pip install packaging - -PYTHON_VER=$(python -c 'import sys; print(".".join(str(s) for s in sys.version_info[:2]))') -pip install git+https://github.com/pyinstaller/pyinstaller.git@413c37bec126c0bd26084813593f65128966b4b7 - -git clone --recursive git://github.com/inducer/loopy -cd loopy - -grep -v pyopencl requirements.txt > myreq.txt - -# needed for pyinstaller package to be usable -echo packaging >> myreq.txt - -pip install -r myreq.txt -python setup.py install - -chown -R user /tmp/build - -su user -p -c "cd /tmp/build && source .env/bin/activate && cd loopy && ./build-helpers/run-pyinstaller.sh" diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh deleted file mode 100755 index a7f621b1ef21676898d2283d93f8a54f086e5d9d..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker-inner.sh +++ /dev/null @@ -1,15 +0,0 @@ -#! 
/bin/bash - -set -e -set -x - -mkdir /tmp/build -cd /tmp/build - -useradd -d /home/user -m -s /bin/bash user - -yum install -y centos-release-scl -yum install -y git python27 python27-python-devel python27-numpy tar gcc gcc-c++ mercurial libffi-devel - -scl enable python27 /mnt/make-linux-build-docker-inner-part-2.sh - diff --git a/build-helpers/make-linux-build-docker.sh b/build-helpers/make-linux-build-docker.sh deleted file mode 100755 index fb0cfb587d654698800bfdc827259691bc056fb7..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker.sh +++ /dev/null @@ -1,28 +0,0 @@ -#! /bin/bash - -# should be run in this directory (build-helpers) - -if test "$1" = "--nodate"; then - TGT_NAME=loopy-centos6 -else - TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d") -fi - -echo "Generating $TGT_NAME..." - -set -e -set -x - -docker pull centos:6 - -CNT=$(docker create -t -v $(pwd):/mnt centos:6 /mnt/make-linux-build-docker-inner.sh) -echo "working in container $CNT" - -docker start -i $CNT - -docker cp $CNT:/tmp/build/loopy/dist/loopy $(pwd) || true - -mv loopy $TGT_NAME - -docker rm $CNT - diff --git a/build-helpers/run-pyinstaller.sh b/build-helpers/run-pyinstaller.sh deleted file mode 100755 index 50f9d85dccc503be2a2ccfb6c0e3d6aa28216981..0000000000000000000000000000000000000000 --- a/build-helpers/run-pyinstaller.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /bin/bash - -# run this from the loopy root directory - -rm -Rf dist build - -pyinstaller \ - --workpath=build/pyinstaller \ - build-helpers/loopy.spec diff --git a/build-helpers/upload.sh b/build-helpers/upload.sh deleted file mode 100755 index 57b8a873b9395954d76a8fd16f8ca9a261e8baa3..0000000000000000000000000000000000000000 --- a/build-helpers/upload.sh +++ /dev/null @@ -1,5 +0,0 @@ -#! 
/bin/bash - -set -e - -scp "$1" tiker.net:public_html/pub/loopy-binaries/ diff --git a/doc/index.rst b/doc/index.rst index b77bbb16f413defe5010c75d28464051553b4486..8f114eb72cdc530dd4109257c4981118c5046f06 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -20,29 +20,6 @@ When you run this script, the following kernel is generated, compiled, and execu .. _static-binary: -Want to try out loopy? ----------------------- - -There's no need to go through :ref:`installation` if you'd just like to get a -feel for what loopy is. Instead, you may -`download a self-contained Linux binary `_. -This is purposefully built on an ancient Linux distribution, so it should work -on most versions of Linux that are currently out there. - -Once you have the binary, do the following:: - - chmod +x ./loopy-centos6 - ./loopy-centos6 --target=opencl hello-loopy.loopy - ./loopy-centos6 --target=cuda hello-loopy.loopy - ./loopy-centos6 --target=ispc hello-loopy.loopy - -Grab the example here: :download:`examples/python/hello-loopy.loopy <../examples/python/hello-loopy.loopy>`. - -You may also donwload the most recent version by going to the `list of builds -`_, clicking on the newest one -of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then -navigating to "build-helpers", and downloading the binary from there. - Places on the web related to Loopy ---------------------------------- diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 409cbef576d654be973dd6d1424ac40d3ea60982..af35221ad5dcd736190e40a454656a7fa069a787 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -151,6 +151,42 @@ Tag Meaning .. }}} +Identifiers +----------- + +Reserved Identifiers +^^^^^^^^^^^^^^^^^^^^ + +The identifier prefix ``_lp_`` is reserved for internal usage; when creating +*inames*, *argument names*, *temporary variable names*, *substitution rule +names*, *instruction IDs*, and other identifiers, users should *not* use names +beginning with ``_lp_``. 
This prefix is used for identifiers created +internally when operating on Loopy's kernel IR. For Loopy developers, further +information on name prefixes used within submodules is below. + +Identifier Registry +^^^^^^^^^^^^^^^^^^^ + +Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for +all internally-created identifiers. Additionally, each name beginning with +``_lp_`` must start with one of the reserved prefixes below. New prefixes may +be registered by adding them to the table below. New prefixes may not themselves +be the prefix of an existing prefix. + +**Reserved Identifier Prefixes** + +======================= ================================== +Reserved Prefix Usage (module or purpose) +======================= ================================== +``_lp_linchk_`` :mod:`loopy.linearization.checker` +======================= ================================== + +.. note:: + + Existing Loopy code may not yet fully satisfy these naming requirements. + Name changes are in progress, and prefixes will be added to this registry + as they are created. + .. _instructions: Instructions diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 740c5cb5848dbb7c6f657011bfc23fa88ca173ec..57d33b53999e06cbb07cc8363bbc46c091033cb3 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -118,7 +118,7 @@ Finishing up .. autofunction:: generate_loop_schedules -.. autofunction:: get_one_scheduled_kernel +.. autofunction:: get_one_linearized_kernel .. autofunction:: save_and_reload_temporaries diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 753b09b5da42835b88a000bc0400fa18a254d80f..1b017f701f8161e93c4fdc1c14644dfe4b4fa74c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1204,16 +1204,16 @@ Here is what happens when we try to generate code for the kernel: This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. 
To see the schedule, we -should call :func:`loopy.get_one_scheduled_kernel`: +should call :func:`loopy.get_one_linearized_kernel`: - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) >>> print(knl) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- - SCHEDULE: + LINEARIZATION: 0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[]) 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: RETURN FROM KERNEL rotate_v2 @@ -1233,12 +1233,12 @@ goes for local temporaries). :func:`loopy.save_and_reload_temporaries` for the purpose of handling the task of saving and restoring temporary values across global barriers. This function adds instructions to the kernel without scheduling them. That means -that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to +that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to put those instructions into the schedule. - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions + >>> knl = lp.get_one_linearized_kernel(knl) # Schedule added instructions >>> print(knl) --------------------------------------------------------------------------- KERNEL: rotate_v2 @@ -1251,7 +1251,7 @@ put those instructions into the schedule. --------------------------------------------------------------------------- ... 
--------------------------------------------------------------------------- - SCHEDULE: + LINEARIZATION: 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save} diff --git a/examples/fortran/matmul-driver.py b/examples/fortran/matmul-driver.py new file mode 100644 index 0000000000000000000000000000000000000000..9db569480d521e58210c030e742386cd12dc8d37 --- /dev/null +++ b/examples/fortran/matmul-driver.py @@ -0,0 +1,35 @@ +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.array +import pyopencl.clrandom +import loopy as lp + + +def main(): + fn = "matmul.floopy" + with open(fn, "r") as inf: + source = inf.read() + + dgemm, = lp.parse_transformed_fortran(source, filename=fn) + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + n = 2048 + a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F") + b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F") + c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F") + cl.clrandom.fill_rand(a) + cl.clrandom.fill_rand(b) + + dgemm = lp.set_options(dgemm, write_code=True) + + dgemm(queue, a=a, b=b, alpha=1, c=c) + + c_ref = (a.get() @ b.get()) + assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10 + + +if __name__ == "__main__": + main() diff --git a/loopy/__init__.py b/loopy/__init__.py index b60de6e2dcd35c1c167bf5e303401f2c6242ebec..807ce88341a8845a154d853077aea649c0938064 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -123,12 +123,12 @@ from loopy.transform.add_barrier import add_barrier from loopy.type_inference import infer_unknown_types from loopy.preprocess import preprocess_kernel, realize_reduction -from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, - Op, MemAccess, 
get_op_poly, get_op_map, get_lmem_access_poly, - get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, - get_synchronization_poly, get_synchronization_map, - gather_access_footprints, gather_access_footprint_bytes) +from loopy.schedule import ( + generate_loop_schedules, get_one_scheduled_kernel, get_one_linearized_kernel) +from loopy.statistics import (ToCountMap, CountGranularity, + stringify_stats_mapping, Op, MemAccess, get_op_map, get_mem_access_map, + get_synchronization_map, gather_access_footprints, + gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -248,16 +248,16 @@ __all__ = [ "infer_unknown_types", "preprocess_kernel", "realize_reduction", - "generate_loop_schedules", "get_one_scheduled_kernel", + "generate_loop_schedules", + "get_one_scheduled_kernel", "get_one_linearized_kernel", "GeneratedProgram", "CodeGenerationResult", "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", - "MemAccess", "get_op_poly", "get_op_map", "get_lmem_access_poly", - "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map", - "get_synchronization_poly", "get_synchronization_map", - "gather_access_footprints", "gather_access_footprint_bytes", + "MemAccess", "get_op_map", "get_mem_access_map", + "get_synchronization_map", "gather_access_footprints", + "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 6837b99a026debf32b12aceef00ed3863c620639..ca70c8489238ee6f1fd95f52b02dbe451ddf13ef 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -534,7 +534,7 @@ def auto_test_vs_ref( from loopy.target.pyopencl import PyOpenCLTarget if test_knl.state not in [ KernelState.PREPROCESSED, - KernelState.SCHEDULED]: + KernelState.LINEARIZED]: if isinstance(test_knl.target, PyOpenCLTarget): test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) 
diff --git a/loopy/check.py b/loopy/check.py index cc87ad9872668bf5323aefd79944e3bbd71b1153..da49c1d116df1a9fbf92e8ef41822b6741405604 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -184,6 +184,19 @@ def check_for_inactive_iname_access(kernel): ", ".join(expression_inames - kernel.insn_inames(insn)))) +def check_for_unused_inames(kernel): + # Warn if kernel has unused inames + from loopy.transform.iname import get_used_inames + unused_inames = kernel.all_inames() - get_used_inames(kernel) + if unused_inames: + warn_with_kernel( + kernel, "unused_inames", + "Found unused inames in kernel: %s " + "Unused inames during linearization will be prohibited in " + "Loopy version 2021.X." + % unused_inames) + + def _is_racing_iname_tag(tv, tag): from loopy.kernel.data import (AddressSpace, LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) @@ -220,12 +233,12 @@ def check_for_write_races(kernel): assignee_inames = assignee_indices & kernel.all_inames() if not assignee_inames <= kernel.insn_inames(insn): raise LoopyError( - "assignee of instructiosn '%s' references " + "assignee of instructions '%s' references " "iname that the instruction does not depend on" % insn.id) if assignee_name in kernel.arg_dict: - # Any parallel tags that are not depended upon by the assignee + # Any concurrent tags that are not depended upon by the assignee # will cause write races. 
raceable_parallel_insn_inames = set( @@ -658,6 +671,7 @@ def pre_schedule_checks(kernel): check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) check_for_inactive_iname_access(kernel) + check_for_unused_inames(kernel) check_for_write_races(kernel) check_for_data_dependent_parallel_bounds(kernel) check_bounds(kernel) diff --git a/loopy/cli.py b/loopy/cli.py index a92922b1845d76dd7a700a93c05de3eecf8c28dd..cdc24800be0edf3935aacccdd4dc4d9905cf5965 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -39,13 +39,13 @@ def defines_to_python_code(defines_str): import re define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$") result = [] - for l in defines_str.split("\n"): - if not l.strip(): + for line in defines_str.split("\n"): + if not line.strip(): continue - match = define_re.match(l) + match = define_re.match(line) if match is None: - raise RuntimeError("#define not understood: '%s'" % l) + raise RuntimeError("#define not understood: '%s'" % line) result.append( "%s = %s" % (match.group(1), to_python_literal(match.group(2)))) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..b4811dc9966921fa612aabef9a726d6b53fd4052 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -388,7 +388,7 @@ def generate_code_v2(kernel): from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel) - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index c946e09a086e574a2593d60f652a81773d95a1fe..b736191ec1dadb842e12453fbec3b68e831338f6 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -59,6 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, 
has_barrier_within) from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, + VectorizeTag, IlpBaseTag) result = find_active_inames_at(kernel, sched_index) @@ -67,7 +68,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): # Find our containing subkernel. Grab inames for all insns from there. within_subkernel = False - for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]): + for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]): from loopy.schedule import CallKernel, ReturnFromKernel if isinstance(sched_item, CallKernel): within_subkernel = True @@ -92,11 +93,12 @@ def get_usable_inames_for_conditional(kernel, sched_index): # # - local indices may not be used in conditionals that cross barriers. # - # - ILP indices are not available in loop bounds, they only get defined - # at the innermost level of nesting. + # - ILP indices and vector lane indices are not available in loop + # bounds, they only get defined at the innermost level of nesting. if ( kernel.iname_tags_of_type(iname, ConcurrentTag) + and not kernel.iname_tags_of_type(iname, VectorizeTag) and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) and crosses_barrier) and not kernel.iname_tags_of_type(iname, IlpBaseTag) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e1520a82ed69fa2aed729d9b1d849a78d658c4e1..7319b16ac2fe9f39872558a3878161b89cab15d9 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -24,7 +24,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl from loopy.schedule import ( @@ -33,30 +32,6 @@ from loopy.schedule import ( from loopy.diagnostic import LoopyError -def get_admissible_conditional_inames_for(codegen_state, sched_index): - """This function disallows conditionals on local-idx tagged - inames if there is a barrier nested somewhere within. - """ - - kernel = codegen_state.kernel - - from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag, - filter_iname_tags_by_type) - - from loopy.schedule import find_active_inames_at, has_barrier_within - result = find_active_inames_at(kernel, sched_index) - - has_barrier = has_barrier_within(kernel, sched_index) - - for iname, tags in six.iteritems(kernel.iname_to_tags): - if (filter_iname_tags_by_type(tags, HardwareConcurrentTag) - and codegen_state.is_generating_device_code): - if not has_barrier or not filter_iname_tags_by_type(tags, LocalIndexTag): - result.add(iname) - - return frozenset(result) - - def synthesize_idis_for_extra_args(kernel, schedule_index): """ :returns: A list of :class:`loopy.codegen.ImplementedDataInfo` @@ -222,14 +197,14 @@ def get_required_predicates(kernel, sched_index): return result -def group_by(l, key, merge): - if not l: - return l +def group_by(entry, key, merge): + if not entry: + return entry result = [] - previous = l[0] + previous = entry[0] - for item in l[1:]: + for item in entry[1:]: if key(previous) == key(item): previous = merge(previous, item) @@ -302,11 +277,13 @@ def build_loop_nest(codegen_state, schedule_index): """ from loopy.schedule import find_used_inames_within + from loopy.codegen.bounds import get_usable_inames_for_conditional + sched_index_info_entries = [ ScheduleIndexInfo( schedule_indices=[i], admissible_cond_inames=( - get_admissible_conditional_inames_for(codegen_state, i)), + get_usable_inames_for_conditional(kernel, i)), required_predicates=get_required_predicates(kernel, i), 
used_inames_within=find_used_inames_within(kernel, i) ) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 5e0747246160ddc2934c3d545c03a2a9b4090d5d..c0ca875c0e9b661becb1bb0ca6e81139a8a93e2d 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -274,7 +274,7 @@ def generate_c_instruction_code(codegen_state, insn): if body: body.append(Line()) - body.extend(Line(l) for l in insn.code.split("\n")) + body.extend(Line(line) for line in insn.code.split("\n")) return Block(body) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 128e4fbc85a2a03e25da3f88b200e67eb41756d3..b3a87798840bb1624d350c79830f29142e54ab6c 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, kernel = codegen_state.kernel from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag, - LocalIndexTag, GroupIndexTag) + LocalIndexTag, GroupIndexTag, VectorizeTag) from loopy.schedule import get_insn_ids_for_block_at insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) @@ -242,7 +242,8 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, all_inames_by_insns |= kernel.insn_inames(insn_id) hw_inames_left = [iname for iname in all_inames_by_insns - if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)] + if kernel.iname_tags_of_type(iname, HardwareConcurrentTag) + and not kernel.iname_tags_of_type(iname, VectorizeTag)] if not hw_inames_left: return next_func(codegen_state) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 05b0a92050a51be1cd980648325921fbf13768d8..40202d4da3319c0ef24b0317f01cd4d31f88d484 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -86,17 +86,17 @@ def _extract_loopy_lines(source): loopy_lines = [] in_loopy_code = False - for l in lines: - comment_match = comment_re.match(l) + for line in 
lines: + comment_match = comment_re.match(line) if comment_match is None: if in_loopy_code: raise LoopyError("non-comment source line in loopy block") - remaining_lines.append(l) + remaining_lines.append(line) # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) continue cmt = comment_match.group(1) @@ -108,7 +108,7 @@ def _extract_loopy_lines(source): in_loopy_code = True # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) elif cmt_stripped == "$loopy end": if not in_loopy_code: @@ -116,16 +116,16 @@ def _extract_loopy_lines(source): in_loopy_code = False # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) elif in_loopy_code: loopy_lines.append(cmt) else: - remaining_lines.append(l) + remaining_lines.append(line) # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) return "\n".join(remaining_lines), "\n".join(loopy_lines) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index d7a1b2498af583bc9ff97ba743ccc5ed8bd25d3a..91a5fdc88f02a99c6064f6b9944b08de662a27a8 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -339,11 +339,11 @@ class F2LoopyTranslator(FTreeWalkerBase): return [] - map_Logical = map_type_decl - map_Integer = map_type_decl - map_Real = map_type_decl - map_Complex = map_type_decl - map_DoublePrecision = map_type_decl + map_Logical = map_type_decl # noqa: N815 + map_Integer = map_type_decl # noqa: N815 + map_Real = map_type_decl # noqa: N815 + map_Complex = map_type_decl # noqa: N815 + map_DoublePrecision = map_type_decl # noqa: N815 def map_Dimension(self, node): scope = self.scope_stack[-1] diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 
80a7ad03101bc67f39c89c6089aa6533d1886185..2d926aad4faa511aa2919630c9b0e96b7f253ad9 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -35,14 +35,13 @@ import islpy as isl from islpy import dim_type import re -from pytools import UniqueNameGenerator, generate_unique_names +from pytools import UniqueNameGenerator, generate_unique_names, natsorted from loopy.library.function import ( default_function_mangler, single_arg_function_mangler) from loopy.diagnostic import CannotBranchDomainTree, LoopyError -from loopy.tools import natsorted from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type from warnings import warn @@ -99,10 +98,25 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): # {{{ loop kernel object +class _deprecated_KernelState_SCHEDULED(object): # noqa + def __init__(self, f): + self.f = f + + def __get__(self, obj, klass): + warn( + "'KernelState.SCHEDULED' is deprecated. " + "Use 'KernelState.LINEARIZED'.", + DeprecationWarning, stacklevel=2) + return self.f() + class KernelState: # noqa INITIAL = 0 PREPROCESSED = 1 - SCHEDULED = 2 + LINEARIZED = 2 + + @_deprecated_KernelState_SCHEDULED + def SCHEDULED(): # pylint:disable=no-method-argument + return KernelState.LINEARIZED # {{{ kernel_state, KernelState compataibility @@ -228,7 +242,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ constructor - def __init__(self, domains, instructions, args=None, schedule=None, + def __init__(self, domains, instructions, args=None, + schedule=None, + linearization=None, name="loopy_kernel", preambles=None, preamble_generators=None, @@ -333,10 +349,27 @@ class LoopKernel(ImmutableRecordWithoutPickling): if state not in [ KernelState.INITIAL, KernelState.PREPROCESSED, - KernelState.SCHEDULED, + KernelState.LINEARIZED, ]: raise ValueError("invalid value for 'state'") + # `linearization` is replacing `schedule`, but we're not changing + # this under the hood yet, so for now, store it 
inside `schedule` + # and raise deprecation warning anyway + if schedule is not None: + if linearization is not None: + # these should not both be present + raise ValueError( + "received both `schedule` and `linearization` args, " + "'LoopKernel.schedule' is deprecated. " + "Use 'LoopKernel.linearization'.") + warn( + "'LoopKernel.schedule' is deprecated. " + "Use 'LoopKernel.linearization'.", + DeprecationWarning, stacklevel=2) + elif linearization is not None: + schedule = linearization + from collections import defaultdict assert not isinstance(iname_to_tags, defaultdict) @@ -1345,7 +1378,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if "schedule" in what and kernel.schedule is not None: lines.extend(sep) if show_labels: - lines.append("SCHEDULE:") + lines.append("LINEARIZATION:") from loopy.schedule import dump_schedule lines.append(dump_schedule(kernel, kernel.schedule)) @@ -1395,6 +1428,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} + # {{{ handle linearization variable that doesn't yet exist + + @property + def linearization(self): + return self.schedule + + # }}} + # {{{ direct execution def __call__(self, *args, **kwargs): diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 9ac38fc87a27da13e98515085edd6f2e35b1fcd7..e6544b34a55af97a1a15e86f7d74855e08e53116 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -186,7 +186,7 @@ class LoopedIlpTag(IlpBaseTag): # }}} -class VectorizeTag(UniqueTag): +class VectorizeTag(UniqueTag, HardwareConcurrentTag): def __str__(self): return "vec" diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 8213c9584b54917050c586e1b83b6d66d0473798..61127232a9f494fe2fdc536dd50d8fdf41b8f17c 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -66,7 +66,8 @@ class InstructionBase(ImmutableRecord): .. attribute:: depends_on_is_final A :class:`bool` determining whether :attr:`depends_on` constitutes - the *entire* list of iname dependencies. 
+ the *entire* list of iname dependencies. If *not* marked final, + various semi-broken heuristics will try to add further dependencies. Defaults to *False*. @@ -344,10 +345,13 @@ class InstructionBase(ImmutableRecord): """ raise NotImplementedError - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): """Return a new copy of *self* where *f* has been applied to every expression occurring in *self*. *args* will be passed as extra arguments (in addition to the expression) to *f*. + + If *assignee_f* is passed, then left-hand sides of assignments are + passed to it. If it is not given, it defaults to the same as *f*. """ raise NotImplementedError @@ -959,12 +963,15 @@ class Assignment(MultiAssignmentBase): def assignee_subscript_deps(self): return (_get_assignee_subscript_deps(self.assignee),) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( - assignee=f(self.assignee, *args), - expression=f(self.expression, *args), + assignee=assignee_f(self.assignee), + expression=f(self.expression), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} @@ -1114,12 +1121,15 @@ class CallInstruction(MultiAssignmentBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( - assignees=f(self.assignees, *args), - expression=f(self.expression, *args), + assignees=assignee_f(self.assignees), + expression=f(self.expression), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} @@ -1315,14 +1325,17 @@ class CInstruction(InstructionBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def 
with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( iname_exprs=[ - (name, f(expr, *args)) + (name, f(expr)) for name, expr in self.iname_exprs], - assignees=[f(a, *args) for a in self.assignees], + assignees=[assignee_f(a) for a in self.assignees], predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} @@ -1357,7 +1370,7 @@ class _DataObliviousInstruction(InstructionBase): def assignee_subscript_deps(self): return frozenset() - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): return self.copy( predicates=frozenset( f(pred) for pred in self.predicates)) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 157099df5a2133baa109f24e8216d63577b5dcb4..e33d260fba4f3f4122f35e033ecc573b41999d5d 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -34,8 +34,7 @@ import numpy as np import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel -from pytools import memoize_on_first_arg -from loopy.tools import natsorted +from pytools import memoize_on_first_arg, natsorted import logging logger = logging.getLogger(__name__) @@ -1381,7 +1380,7 @@ def draw_dependencies_as_unicode_arrows( .replace(style.RESET_ALL, "")) return len(s) - def truncate_without_color_escapes(s, l): + def truncate_without_color_escapes(s, length): # FIXME: This is a bit dumb--it removes color escapes when truncation # is needed. 
@@ -1389,7 +1388,7 @@ def draw_dependencies_as_unicode_arrows( .replace(fore.RED, "") .replace(style.RESET_ALL, "")) - return s[:l] + u"…" + return s[:length] + u"…" def conform_to_uniform_length(s): len_s = len_without_color_escapes(s) @@ -1428,6 +1427,8 @@ def stringify_instruction_list(kernel): def insert_insn_into_order(insn): if insn.id in printed_insn_ids: + # Note: dependency cycles are deliberately ignored so that printing + # succeeds. return printed_insn_ids.add(insn.id) @@ -1511,7 +1512,7 @@ def stringify_instruction_list(kernel): ", ".join("%s=%s" % (name, expr) for name, expr in insn.iname_exprs)) - trailing = [l for l in insn.code.split("\n")] + trailing = insn.code.split("\n") elif isinstance(insn, lp.BarrierInstruction): lhs = "" rhs = "... %sbarrier" % insn.synchronization_kind[0] @@ -1583,6 +1584,13 @@ def stringify_instruction_list(kernel): # {{{ global barrier order finding +def _is_global_barrier(kernel, insn_id): + insn = kernel.id_to_insn[insn_id] + from loopy.kernel.instruction import BarrierInstruction + return isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global" + + @memoize_on_first_arg def get_global_barrier_order(kernel): """Return a :class:`tuple` of the listing the ids of global barrier instructions @@ -1590,49 +1598,27 @@ def get_global_barrier_order(kernel): See also :class:`loopy.instruction.BarrierInstruction`. 
""" - barriers = [] - visiting = set() - visited = set() - - unvisited = set(insn.id for insn in kernel.instructions) - - def is_barrier(my_insn_id): - insn = kernel.id_to_insn[my_insn_id] - from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and \ - insn.synchronization_kind == "global" - - while unvisited: - stack = [unvisited.pop()] - - while stack: - top = stack[-1] - - if top in visiting: - visiting.remove(top) - if is_barrier(top): - barriers.append(top) + dep_graph = {insn.id: set() for insn in kernel.instructions} + for insn in kernel.instructions: + for dep in insn.depends_on: + dep_graph[dep].add(insn.id) - if top in visited: - stack.pop() - continue + from pytools.graph import compute_topological_order + order = compute_topological_order(dep_graph) - visited.add(top) - visiting.add(top) + barriers = [ + insn_id for insn_id in order + if _is_global_barrier(kernel, insn_id)] - for child in kernel.id_to_insn[top].depends_on: - # Check for no cycles. - assert child not in visiting - stack.append(child) + del order # Ensure this is the only possible order. # # We do this by looking at the barriers in order. # We check for each adjacent pair (a,b) in the order if a < b, # i.e. if a is reachable by a chain of dependencies from b. - - visiting.clear() - visited.clear() + visited = set() + visiting = set() for prev_barrier, barrier in zip(barriers, barriers[1:]): # Check if prev_barrier is reachable from barrier. 
@@ -1690,12 +1676,6 @@ def find_most_recent_global_barrier(kernel, insn_id): if len(insn.depends_on) == 0: return None - def is_barrier(my_insn_id): - insn = kernel.id_to_insn[my_insn_id] - from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and \ - insn.synchronization_kind == "global" - global_barrier_to_ordinal = dict( (b, i) for i, b in enumerate(global_barrier_order)) @@ -1705,7 +1685,7 @@ def find_most_recent_global_barrier(kernel, insn_id): else -1) direct_barrier_dependencies = set( - dep for dep in insn.depends_on if is_barrier(dep)) + dep for dep in insn.depends_on if _is_global_barrier(kernel, dep)) if len(direct_barrier_dependencies) > 0: return max(direct_barrier_dependencies, key=get_barrier_ordinal) @@ -1727,8 +1707,8 @@ def get_subkernels(kernel): See also :class:`loopy.schedule.CallKernel`. """ from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: - raise LoopyError("Kernel must be scheduled") + if kernel.state != KernelState.LINEARIZED: + raise LoopyError("Kernel must be linearized") from loopy.schedule import CallKernel @@ -1744,7 +1724,7 @@ def get_subkernel_to_insn_id_map(kernel): kernel must be scheduled. 
""" from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: raise LoopyError("Kernel must be scheduled") from loopy.schedule import ( diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c0eb91ea60317ef8cad1c594571d46bba2d1a671..de81815a82655136941b57b1f78486aed39237da 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.transform.iname import remove_any_newly_unused_inames import logging logger = logging.getLogger(__name__) @@ -289,7 +290,7 @@ def _classify_reduction_inames(kernel, inames): nonlocal_par = [] from loopy.kernel.data import ( - LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, + LocalIndexTagBase, UnrolledIlpTag, UnrollTag, ConcurrentTag, filter_iname_tags_by_type) for iname in inames: @@ -303,7 +304,7 @@ def _classify_reduction_inames(kernel, inames): elif filter_iname_tags_by_type(iname_tags, LocalIndexTagBase): local_par.append(iname) - elif filter_iname_tags_by_type(iname_tags, (ConcurrentTag, VectorizeTag)): + elif filter_iname_tags_by_type(iname_tags, ConcurrentTag): nonlocal_par.append(iname) else: @@ -882,6 +883,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} +@remove_any_newly_unused_inames def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): @@ -1370,7 +1372,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, track_iname = var_name_gen( "{sweep_iname}__seq_scan" - .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + .format(sweep_iname=sweep_iname)) get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, 
scan_min_value, @@ -1480,7 +1482,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, track_iname = var_name_gen( "{sweep_iname}__pre_scan" - .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + .format(sweep_iname=sweep_iname)) get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, @@ -1924,8 +1926,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, kernel = lp.tag_inames(kernel, new_iname_tags) - # TODO: remove unused inames... - kernel = ( _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) @@ -1979,7 +1979,7 @@ def find_idempotence(kernel): # Find SCCs of dep_graph. These are used for checking if the instruction is # in a dependency cycle. - from loopy.tools import compute_sccs + from pytools.graph import compute_sccs sccs = dict((item, scc) for scc in compute_sccs(dep_graph) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index fb0d0e2c17005ecf051d7034fd7903ed5262bdfc..032cdc2760597f1fa6f701a8a88252312deac797 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -212,12 +212,12 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag all_nonpar_inames = set( iname for iname in kernel.all_inames() if not kernel.iname_tags_of_type(iname, - (ConcurrentTag, IlpBaseTag, VectorizeTag))) + (ConcurrentTag, IlpBaseTag))) iname_to_insns = kernel.iname_to_insns() @@ -276,7 +276,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): if kernel.iname_tags_of_type(iname, ConcurrentTag): @@ -310,7 +310,7 @@ def 
find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): continue if kernel.iname_tags_of_type(dep_insn_iname, - (ConcurrentTag, IlpBaseTag, VectorizeTag)): + (ConcurrentTag, IlpBaseTag)): # Parallel tags don't really nest, so we'll disregard # them here. continue @@ -1841,7 +1841,7 @@ def generate_loop_schedules(kernel, debug_args={}): def generate_loop_schedules_inner(kernel, debug_args={}): from loopy.kernel import KernelState - if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): + if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") @@ -1852,7 +1852,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): debug = ScheduleDebugger(**debug_args) - preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else () + preschedule = kernel.schedule if kernel.state == KernelState.LINEARIZED else () prescheduled_inames = set( insn.iname @@ -1904,7 +1904,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), scheduled_insn_ids=frozenset(), - within_subkernel=kernel.state != KernelState.SCHEDULED, + within_subkernel=kernel.state != KernelState.LINEARIZED, may_schedule_global_barriers=True, preschedule=preschedule, @@ -1973,11 +1973,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}): new_kernel = kernel.copy( schedule=gen_sched, - state=KernelState.SCHEDULED) + state=KernelState.LINEARIZED) from loopy.schedule.device_mapping import \ map_schedule_onto_host_or_device - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: # Device mapper only gets run once. 
new_kernel = map_schedule_onto_host_or_device(new_kernel) @@ -2029,6 +2029,15 @@ def _get_one_scheduled_kernel_inner(kernel): def get_one_scheduled_kernel(kernel): + warn_with_kernel( + kernel, "get_one_scheduled_kernel_deprecated", + "get_one_scheduled_kernel is deprecated. " + "Use get_one_linearized_kernel instead.", + DeprecationWarning) + return get_one_linearized_kernel(kernel) + + +def get_one_linearized_kernel(kernel): from loopy import CACHING_ENABLED sched_cache_key = kernel diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 59afb07d2e9b7713dbe86c2c5aef7356decbbcff..d45c1ecbdc7ea091ce7d1a3899e82c14bb6fef2b 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -31,7 +31,7 @@ from loopy.schedule.tools import get_block_boundaries def map_schedule_onto_host_or_device(kernel): # FIXME: Should be idempotent. from loopy.kernel import KernelState - assert kernel.state == KernelState.SCHEDULED + assert kernel.state == KernelState.LINEARIZED from functools import partial device_prog_name_gen = partial( diff --git a/loopy/statistics.py b/loopy/statistics.py index 10d29daad062744ca3fbe2dc2261be4cd2c4ca99..32fe7741e1298c99e2baf74f3e08e67fc8b2a63e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1863,75 +1863,4 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): # }}} - -# {{{ compat goop - -def get_lmem_access_poly(knl): - """Count the number of local memory accesses in a loopy kernel. - - get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['local'] option. - - """ - warn_with_kernel(knl, "deprecated_get_lmem_access_poly", - "get_lmem_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['local'] option.") - return get_mem_access_map(knl).filter_by(mtype=['local']) - - -def get_DRAM_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. 
- - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "deprecated_get_DRAM_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -def get_gmem_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "deprecated_get_gmem_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -def get_synchronization_poly(knl): - """Count the number of synchronization events each work-item encounters in - a loopy kernel. - - get_synchronization_poly is deprecated. Use get_synchronization_map - instead. - - """ - warn_with_kernel(knl, "deprecated_get_synchronization_poly", - "get_synchronization_poly is deprecated. Use " - "get_synchronization_map instead.") - return get_synchronization_map(knl) - - -def get_op_poly(knl, numpy_types=True): - """Count the number of operations in a loopy kernel. - - get_op_poly is deprecated. Use get_op_map instead. - - """ - warn_with_kernel(knl, "deprecated_get_op_poly", - "get_op_poly is deprecated. 
Use get_op_map instead.") - return get_op_map(knl, numpy_types) - -# }}} - # vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index ccac5e199d2b53e202dd735ffd8dfe20a7dc29a2..4156dfcc1673d176ffb609cf280b28c97cc4949f 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -273,8 +273,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation) # noqa - ): + or type(expr.operation) != type(other.operation)): # noqa return [] return self.rec(expr.expr, other.expr, unis) @@ -971,7 +970,8 @@ class RuleAwareIdentityMapper(IdentityMapper): # may perform tasks entirely unrelated to subst rules, so # we must map assignees, too. self.map_instruction(kernel, - insn.with_transformed_expressions(self, kernel, insn)) + insn.with_transformed_expressions( + lambda expr: self(expr, kernel, insn))) for insn in kernel.instructions] return kernel.copy(instructions=new_insns) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6e3602eda11d5f65e8a6af2977966e946c72a718..8869ebecf3e08bf7921d4c9118dd1fda263adb32 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -80,6 +80,11 @@ class DTypeRegistryWrapper(object): def c99_preamble_generator(preamble_info): if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes): yield("10_stdint", "#include <stdint.h>") + if any(dtype.numpy_dtype == np.dtype("bool") + for dtype in preamble_info.seen_dtypes): + yield("10_stdbool", "#include <stdbool.h>") + if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes): + yield("10_complex", "#include <complex.h>") def _preamble_generator(preamble_info): @@ -436,7 +441,7 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): arg_dtypes=arg_dtypes) # binary functions - if (name in ["fmax", "fmin"] + if (name in ["fmax", "fmin", "copysign"] and len(arg_dtypes) == 2): dtype = 
np.find_common_type( @@ -1079,9 +1084,11 @@ class CTarget(CFamilyTarget): @memoize_method def get_dtype_registry(self): from loopy.target.c.compyte.dtypes import ( - DTypeRegistry, fill_registry_with_c99_stdint_types) + DTypeRegistry, fill_registry_with_c99_stdint_types, + fill_registry_with_c99_complex_types) result = DTypeRegistry() fill_registry_with_c99_stdint_types(result) + fill_registry_with_c99_complex_types(result) return DTypeRegistryWrapper(result) diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372..7e48e1166a13cfbb7b60f909b071f088034ffda1 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372 +Subproject commit 7e48e1166a13cfbb7b60f909b071f088034ffda1 diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c5e8d0a7f7a9f70b3afe46e9d04a3bf861066329..845e0a4326dbb24e509f98c808a9ce3ac3cb52be 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -1,3 +1,4 @@ +# coding: utf-8 """OpenCL target integrated with PyOpenCL.""" from __future__ import division, absolute_import @@ -285,6 +286,9 @@ class PyOpenCLTarget(OpenCLTarget): warnings) and support for complex numbers. """ + # FIXME make prefixes conform to naming rules + # (see Reference: Loopy’s Model of a Kernel) + host_program_name_prefix = "_lpy_host_" host_program_name_suffix = "" @@ -299,7 +303,26 @@ class PyOpenCLTarget(OpenCLTarget): self.device = device self.pyopencl_module_name = pyopencl_module_name - comparison_fields = ["device"] + # NB: Not including 'device', as that is handled specially here. 
+ hash_fields = OpenCLTarget.hash_fields + ( + "pyopencl_module_name",) + comparison_fields = OpenCLTarget.comparison_fields + ( + "pyopencl_module_name",) + + def __eq__(self, other): + if not super(PyOpenCLTarget, self).__eq__(other): + return False + + if (self.device is None) != (other.device is None): + return False + + if self.device is not None: + assert other.device is not None + return (self.device.persistent_unique_id + == other.device.persistent_unique_id) + else: + assert other.device is None + return True def update_persistent_hash(self, key_hash, key_builder): super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder) diff --git a/loopy/tools.py b/loopy/tools.py index 33b6616f32fb6c5fa6e4517e137ef426a806fb3f..a1cd5e108a45ba60c71b3bb7a51f779b84172065 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -210,11 +210,11 @@ def remove_common_indentation(code, require_leading_newline=True, test_line = None if ignore_lines_starting_with: - for l in lines: - strip_l = l.lstrip() + for line in lines: + strip_l = line.lstrip() if (strip_l and not strip_l.startswith(ignore_lines_starting_with)): - test_line = l + test_line = line break else: @@ -355,65 +355,6 @@ def empty_aligned(shape, dtype, order='C', n=64): # }}} -# {{{ compute SCCs with Tarjan's algorithm - -def compute_sccs(graph): - to_search = set(graph.keys()) - visit_order = {} - scc_root = {} - sccs = [] - - while to_search: - top = next(iter(to_search)) - call_stack = [(top, iter(graph[top]), None)] - visit_stack = [] - visiting = set() - - scc = [] - - while call_stack: - top, children, last_popped_child = call_stack.pop() - - if top not in visiting: - # Unvisited: mark as visited, initialize SCC root. - count = len(visit_order) - visit_stack.append(top) - visit_order[top] = count - scc_root[top] = count - visiting.add(top) - to_search.discard(top) - - # Returned from a recursion, update SCC. 
- if last_popped_child is not None: - scc_root[top] = min( - scc_root[top], - scc_root[last_popped_child]) - - for child in children: - if child not in visit_order: - # Recurse. - call_stack.append((top, children, child)) - call_stack.append((child, iter(graph[child]), None)) - break - if child in visiting: - scc_root[top] = min( - scc_root[top], - visit_order[child]) - else: - if scc_root[top] == visit_order[top]: - scc = [] - while visit_stack[-1] != top: - scc.append(visit_stack.pop()) - scc.append(visit_stack.pop()) - for item in scc: - visiting.remove(item) - sccs.append(scc) - - return sccs - -# }}} - - # {{{ pickled container value class _PickledObject(object): @@ -673,20 +614,4 @@ def is_interned(s): def intern_frozenset_of_ids(fs): return frozenset(intern(s) for s in fs) - -def natorder(key): - # Return natural ordering for strings, as opposed to dictionary order. - # E.g. will result in - # 'abc1' < 'abc9' < 'abc10' - # rather than - # 'abc1' < 'abc10' < 'abc9' - # Based on - # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 - import re - return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] - - -def natsorted(seq, key=lambda x: x): - return sorted(seq, key=lambda y: natorder(key(y))) - # vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index a6a2d7b4fe4ba94caa8cbe112a5cf90719ceb643..1f0161c06868da4a7c71ba1ebf9eab8ef02eeb3d 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -285,15 +285,15 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, if temporary_name is None: temporary_name = var_name_gen("%s_fetch" % c_name) - arg = kernel.arg_dict[var_name] + var_descr = kernel.get_var_descriptor(var_name) # {{{ make parameter names and unification template parameters = [] - for i in range(arg.num_user_axes()): + for i in range(var_descr.num_user_axes()): based_on = "%s_dim_%d" % (c_name, i) - if arg.dim_names is not None: - based_on = 
"%s_dim_%s" % (c_name, arg.dim_names[i]) + if var_descr.dim_names is not None: + based_on = "%s_dim_%s" % (c_name, var_descr.dim_names[i]) if dim_arg_names is not None and i < len(dim_arg_names): based_on = dim_arg_names[i] @@ -322,7 +322,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, kernel, subst_use, sweep_inames, inames_to_be_removed = \ _process_footprint_subscripts( kernel, rule_name, sweep_inames, - footprint_subscripts, arg) + footprint_subscripts, var_descr) # Our _not_provided is actually a different object from the one in the # precompute module, but precompute acutally uses that to adjust its @@ -331,7 +331,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, from loopy.transform.precompute import precompute new_kernel = precompute(kernel, subst_use, sweep_inames, precompute_inames=dim_arg_names, - default_tag=default_tag, dtype=arg.dtype, + default_tag=default_tag, dtype=var_descr.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, temporary_address_space=temporary_address_space, diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 96c8252ef7e6622250e9006b2275ef7816700b5c..8432d59ec5b162f6e963abbeae3b2fcabe94cf27 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -977,8 +977,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # is inspected. For each element of the power set without the # empty and the full set, one duplication option is generated. 
for insns_to_dup in it.chain.from_iterable( - it.combinations(iname_insns, l) - for l in range(1, len(iname_insns))): + it.combinations(iname_insns, i) + for i in range(1, len(iname_insns))): yield ( iname, tuple(insn | old_common_inames for insn in insns_to_dup)) @@ -1184,6 +1184,19 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): # {{{ remove unused inames +def get_used_inames(knl): + import loopy as lp + exp_knl = lp.expand_subst(knl) + + used_inames = set() + for insn in exp_knl.instructions: + used_inames.update( + exp_knl.insn_inames(insn.id) + | insn.reduction_inames()) + + return used_inames + + def remove_unused_inames(knl, inames=None): """Delete those among *inames* that are unused, i.e. project them out of the domain. If these inames pose implicit restrictions on @@ -1204,17 +1217,7 @@ def remove_unused_inames(knl, inames=None): # {{{ check which inames are unused - import loopy as lp - exp_knl = lp.expand_subst(knl) - - inames = set(inames) - used_inames = set() - for insn in exp_knl.instructions: - used_inames.update( - exp_knl.insn_inames(insn.id) - | insn.reduction_inames()) - - unused_inames = inames - used_inames + unused_inames = set(inames) - get_used_inames(knl) # }}} @@ -1235,6 +1238,33 @@ def remove_unused_inames(knl, inames=None): return knl + +def remove_any_newly_unused_inames(transformation_func): + from functools import wraps + + @wraps(transformation_func) + def wrapper(knl, *args, **kwargs): + + # check for remove_unused_inames argument, default: True + remove_newly_unused_inames = kwargs.pop("remove_newly_unused_inames", True) + + if remove_newly_unused_inames: + # determine which inames were already unused + inames_already_unused = knl.all_inames() - get_used_inames(knl) + + # call transform + transformed_knl = transformation_func(knl, *args, **kwargs) + + # Remove inames that are unused due to transform + return remove_unused_inames( + transformed_knl, + 
transformed_knl.all_inames()-inames_already_unused) + else: + # call transform + return transformation_func(knl, *args, **kwargs) + + return wrapper + # }}} diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 9f426f76bc6902fd09bd7685c73f187df935be1e..b308836c7727564dbfa9625ad39f378e8034c68c 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -229,7 +229,8 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): for insn in kernel.instructions: self.replaced_something = False - insn = insn.with_transformed_expressions(self, kernel, insn) + insn = insn.with_transformed_expressions( + lambda expr: self(expr, kernel, insn)) if self.replaced_something: insn = insn.copy( diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index b92698ffa1e84455be3f79bed7dbf884f36be490..717a051930e938457dae0ee4441325b3e631d2d9 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -25,10 +25,9 @@ THE SOFTWARE. import six from loopy.symbolic import ( - get_dependencies, SubstitutionMapper, RuleAwareIdentityMapper, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError -from pymbolic.mapper.substitutor import make_subst_func +from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var @@ -80,40 +79,13 @@ def extract_subst(kernel, subst_name, template, parameters=()): # }}} - # {{{ deal with iname deps of template that are not independent_inames - - # (We call these 'matching_vars', because they have to match exactly in - # every CSE. As above, they might need to be renamed to make them unique - # within the kernel.) 
- - matching_vars = [] - old_to_new = {} - - for iname in (get_dependencies(template) - - set(parameters) - - kernel.non_iname_variable_names()): - if iname in kernel.all_inames(): - # need to rename to be unique - new_iname = var_name_gen(iname) - old_to_new[iname] = var(new_iname) - matching_vars.append(new_iname) - else: - matching_vars.append(iname) - - if old_to_new: - template = ( - SubstitutionMapper(make_subst_func(old_to_new)) - (template)) - - # }}} - # {{{ gather up expressions expr_descriptors = [] from loopy.symbolic import UnidirectionalUnifier unif = UnidirectionalUnifier( - lhs_mapping_candidates=set(parameters) | set(matching_vars)) + lhs_mapping_candidates=set(parameters)) def gather_exprs(expr, mapper): urecs = unif(template, expr) @@ -177,8 +149,30 @@ def extract_subst(kernel, subst_name, template, parameters=()): new_insns = [] + def transform_assignee(expr): + # Assignment LHS's cannot be subst rules. Treat them + # specially. + + import pymbolic.primitives as prim + if isinstance(expr, tuple): + return tuple( + transform_assignee(expr_i) + for expr_i in expr) + + elif isinstance(expr, prim.Subscript): + return type(expr)( + expr.aggregate, + cbmapper(expr.index)) + + elif isinstance(expr, prim.Variable): + return expr + else: + raise ValueError("assignment LHS not understood") + for insn in kernel.instructions: - new_insns.append(insn.with_transformed_expressions(cbmapper)) + new_insns.append( + insn.with_transformed_expressions( + cbmapper, assignee_f=transform_assignee)) from loopy.kernel.data import SubstitutionRule new_substs = { @@ -285,6 +279,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@remove_any_newly_unused_inames def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 
010a0658f71bcfcb037a81c6b61fd9417fc98b75..32f039a22a5f8ff076669ecb23f00ad63ed85dd5 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -532,7 +532,7 @@ def infer_unknown_types(kernel, expect_completion=False): if read_var in names_for_type_inference)) for written_var in names_for_type_inference) - from loopy.tools import compute_sccs + from pytools.graph import compute_sccs # To speed up processing, we sort the variables by computing the SCCs of the # type dependency graph. Each SCC represents a set of variables whose types diff --git a/loopy/version.py b/loopy/version.py index 29abbc2de889b884de93e5fe39a1d996811c93c9..d69a3b574122622105e4b52c74ec8c595fc816b6 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -42,7 +42,7 @@ else: # }}} -VERSION = (2019, 1) +VERSION = (2020, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/setup.cfg b/setup.cfg index eec3dfd1f52ed97c58f5281716eac8fc18980094..a0d95746e1a399d6a2d7c315bffc9b834d2f5487 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,N817,W504 max-line-length=85 exclude= loopy/target/c/compyte/ndarray, diff --git a/setup.py b/setup.py index 75d8b340e8ad98794a244f7e5da89e079870bd2b..bba29986997e8e762ad52f38feae6311c4892c10 100644 --- a/setup.py +++ b/setup.py @@ -76,10 +76,7 @@ setup(name="loo.py", 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Scientific/Engineering :: Mathematics', @@ -89,7 +86,7 @@ 
setup(name="loo.py", ], install_requires=[ - "pytools>=2018.4", + "pytools>=2020.2", "pymbolic>=2019.2", "genpy>=2016.1.2", "cgen>=2016.1", diff --git a/test/test_apps.py b/test/test_apps.py index e07262dbdda8ad3c24522f7d0eb4dba8422bf0ce..71029cc9ce408f8e7fa95eaf3b766864c4beee5b 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -566,7 +566,7 @@ def test_poisson_fem(ctx_factory): sdim = 3 knl = lp.make_kernel( - "{ [c,i,j,k,ell,ell2,ell3]: \ + "{ [c,i,j,k,ell,ell2]: \ 0 <= c < nels and \ 0 <= i < nbf and \ 0 <= j < nbf and \ diff --git a/test/test_domain.py b/test/test_domain.py index ebfde850907d68bebf06076fbf1c87d8bb093f71..5daf84eaa5b7ffd1647daf4b35acd7a5de91c5d1 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -111,7 +111,7 @@ def test_eq_constraint(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j]: 0<= i,j < 32}", + "{[i]: 0<= i < 32}", [ "a[i] = b[i]" ], diff --git a/test/test_fortran.py b/test/test_fortran.py index e08033360d403f548d552108e6fd98b9117e19bd..3601e96b752f18e6e01bcfcffe49780bda4058b4 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -275,6 +275,12 @@ def test_tagged(ctx_factory): "i_inner,j_inner", ]) def test_matmul(ctx_factory, buffer_inames): + ctx = ctx_factory() + + if (buffer_inames and + ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + logging.basicConfig(level=logging.INFO) fortran_src = """ @@ -316,7 +322,6 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") - ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) diff --git a/test/test_loopy.py b/test/test_loopy.py index 203ebb3922d3cc7f41b56abc31202b8974b88117..f9345d5b6cd9b97da80bb2ff8e5c6c657c199402 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -177,7 +177,7 @@ def test_simple_side_effect(ctx_factory): ctx = 
ctx_factory() knl = lp.make_kernel( - "{[i,j]: 0<=i,j<100}", + "{[i]: 0<=i<100}", """ a[i] = a[i] + 1 """, @@ -456,7 +456,7 @@ def test_nonlinear_index(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j]: 0<=i,j src_ibox = source_boxes[isrc_box] @@ -769,7 +769,7 @@ def test_multiple_writes_to_local_temporary(): # writes are OK. knl = lp.make_kernel( - "{[i,e]: 0<=i<5 and 0<=e temp[i, 0] = 17 temp[i, 1] = 15 @@ -952,7 +952,7 @@ def test_atomic_init(dtype): vec_width = 4 knl = lp.make_kernel( - "{ [i,j]: 0<=i<100 }", + "{ [i]: 0<=i<100 }", """ out[i%4] = 0 {id=init, atomic=init} """, @@ -1555,7 +1555,7 @@ def test_finite_difference_expr_subst(ctx_factory): gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True, default_tag="l.auto") - precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"}) + precomp_knl = lp.tag_inames(precomp_knl, {"j_outer": "unr"}) precomp_knl = lp.set_options(precomp_knl, return_dict=True) evt, _ = precomp_knl(queue, u=u, h=h) @@ -1926,8 +1926,9 @@ def test_scalars_with_base_storage(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + import islpy as isl knl = lp.make_kernel( - "{ [i]: 0<=i<1}", + [isl.BasicSet("[] -> {[]: }")], # empty (domain w/unused inames errors) "a = 1", [lp.TemporaryVariable("a", dtype=np.float64, shape=(), base_storage="base")]) diff --git a/test/test_misc.py b/test/test_misc.py index 7a834a6f5d393298e97df22d47a1de3b64354a42..dc5045fe0f7a3756d9a70a52d0a0c3dbb92f3e69 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -24,7 +24,6 @@ THE SOFTWARE. 
import six # noqa import pytest -from six.moves import range import sys @@ -35,50 +34,6 @@ logger = logging.getLogger(__name__) from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -def test_compute_sccs(): - from loopy.tools import compute_sccs - import random - - rng = random.Random(0) - - def generate_random_graph(nnodes): - graph = dict((i, set()) for i in range(nnodes)) - for i in range(nnodes): - for j in range(nnodes): - # Edge probability 2/n: Generates decently interesting inputs. - if rng.randint(0, nnodes - 1) <= 1: - graph[i].add(j) - return graph - - def verify_sccs(graph, sccs): - visited = set() - - def visit(node): - if node in visited: - return [] - else: - visited.add(node) - result = [] - for child in graph[node]: - result = result + visit(child) - return result + [node] - - for scc in sccs: - scc = set(scc) - assert not scc & visited - # Check that starting from each element of the SCC results - # in the same set of reachable nodes. - for scc_root in scc: - visited.difference_update(scc) - result = visit(scc_root) - assert set(result) == scc, (set(result), scc) - - for nnodes in range(10, 20): - for i in range(40): - graph = generate_random_graph(nnodes) - verify_sccs(graph, compute_sccs(graph)) - - def test_SetTrie(): from loopy.kernel.tools import SetTrie diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index e022e92f3712d984c1ad68061d0052240ff9d20c..54c64e0a4d4a23b429eb83be6c0a19f482a1b922 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -48,7 +48,7 @@ def test_tim2d(ctx_factory): # K - run-time symbolic knl = lp.make_kernel( - "{[i,j,e,m,o,o2,gi]: 0<=i,j,m,o,o2 {[]: }")], """ a, b = make_tuple(1, 2.) 
""") diff --git a/test/test_transform.py b/test/test_transform.py index cdc0c14b8bacc4fe5279d000461c0ea2244af021..ffef893b05fbca5a0d244ff17f379e1bb5cf27a1 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -74,7 +74,7 @@ def test_collect_common_factors(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j out_tmp = 0 {id=out_init,inames=i} out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} @@ -385,7 +385,7 @@ def test_precompute_nested_subst(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j]: 0<=i 1: exec(sys.argv[1])