diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000000000000000000000000000000000..dcbc21d86f9e4b17ea7e8803d538c4c0f0b6276a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,32 @@ +# https://editorconfig.org/ +# https://github.com/editorconfig/editorconfig-vim +# https://github.com/editorconfig/editorconfig-emacs + +root = true + +[*] +indent_style = space +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.py] +indent_size = 4 + +[*.rst] +indent_size = 4 + +[*.cpp] +indent_size = 2 + +[*.hpp] +indent_size = 2 + +# There may be one in doc/ +[Makefile] +indent_style = tab + +# https://github.com/microsoft/vscode/issues/1679 +[*.md] +trim_trailing_whitespace = false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c85e22c12d78cb2e5a3ef753bc8baf4ee4cb3780..7d8101763de864e20bd92c6be0d1fef0e31d1b31 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,11 +18,11 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: '3.x' + python-version: '3.x' - name: "Main Script" run: | curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh ./loopy ./test + . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test examples pylint: name: Pylint @@ -35,10 +35,10 @@ jobs: CONDA_ENVIRONMENT=.test-conda-env.yml USE_CONDA_BUILD=1 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - . ./prepare-and-run-pylint.sh loopy test/test_*.py + . ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py - pytest3: - name: Conda Pytest Py3 + pytest: + name: Conda Pytest runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -48,29 +48,58 @@ jobs: curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh . ./build-and-test-py-project-within-miniconda.sh - pytest2: - name: Conda Pytest Py2 + pytest_twice: + name: Conda Pytest Twice (for cache behavior) runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: "Main Script" run: | - sed 's/python=3/python=2.7/' .test-conda-env-py3.yml > .test-conda-env-py2.yml - cat .test-conda-env-py2.yml - CONDA_ENVIRONMENT=.test-conda-env-py2.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . ./build-and-test-py-project-within-miniconda.sh + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . ./ci-support.sh + build_py_project_in_conda_env + ( test_py_project ) + ( test_py_project ) - pytest_twice: - name: Pytest twice (for cache behavior) on Py${{ matrix.python-version }} + examples: + name: Conda Examples runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: "Main Script" run: | CONDA_ENVIRONMENT=.test-conda-env-py3.yml - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - . ./build-and-test-py-project-within-miniconda.sh - ${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxs $TESTABLES + EXTRA_INSTALL="matplotlib ipykernel nbconvert" + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . 
./ci-support.sh + build_py_project_in_conda_env + + curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz - + export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH" + + export PYOPENCL_TEST=portable:pthread + + . ./build-py-project-and-run-examples.sh + run_py_examples + run_ipynb_examples + run_floopy_examples + + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: "Main Script" + run: | + CONDA_ENVIRONMENT=.test-conda-env-py3.yml + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . ci-support.sh + build_py_project_in_conda_env + build_docs # vim: sw=4 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c11e507ee79cdc6f1567acbf6c12bbd7ed22f1cc..d69f0b8c489c07d3aa1512f6f1cbb8ced0f6a2e9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,26 +1,7 @@ -Python 2.7 POCL: - script: - - export PY_EXE=python2.7 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako" - - export LOOPY_NO_CACHE=1 - - export NO_DOCTESTS=1 - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - tags: - - python2.7 - - pocl - except: - - tags - artifacts: - reports: - junit: test/pytest.xml - - Python 3 POCL: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable + - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" - export LOOPY_NO_CACHE=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh @@ -55,15 +36,15 @@ Python 3 Intel: Python 3 POCL Twice With Cache: - script: - - export PY_EXE=python3 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako" - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" - - "cd .." - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - - ". ./build-and-test-py-project.sh" + script: | + export PY_EXE=python3 + export PYOPENCL_TEST=portable:pthread + export EXTRA_INSTALL="pybind11 numpy mako" + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . ./ci-support.sh + build_py_project_in_venv + ( test_py_project ) + ( test_py_project ) tags: - python3 - pocl @@ -77,7 +58,7 @@ Python 3 POCL Twice With Cache: # PyPy POCL: # script: # - export PY_EXE=pypy -# - export PYOPENCL_TEST=portable +# - export PYOPENCL_TEST=portable:pthread # - export EXTRA_INSTALL="pybind11 numpy mako" # - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh # - ". ./build-and-test-py-project.sh" @@ -88,16 +69,26 @@ Python 3 POCL Twice With Cache: # - tags Python 3 POCL Examples: - script: - - export PY_EXE=python3 - - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - - ". ./build-py-project-and-run-examples.sh" + script: | + export PY_EXE=python3 + export PYOPENCL_TEST=portable:pthread + export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" + + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + . 
./ci-support.sh + build_py_project_in_venv + + curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz - + export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH" + + . ./build-py-project-and-run-examples.sh + run_py_examples + run_ipynb_examples + run_floopy_examples tags: - python3 - pocl - large-node - - ispc except: - tags @@ -108,26 +99,12 @@ Pylint: - export PY_EXE=python3 - EXTRA_INSTALL="pybind11 numpy mako matplotlib ipykernel ply fparser" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - - ". ./prepare-and-run-pylint.sh loopy test/test_*.py" + - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py tags: - python3 except: - tags -CentOS binary: - script: - - (cd build-helpers; ./make-linux-build-docker.sh --nodate) - - (cd ./build-helpers; ./loopy-centos6 ../examples/fortran/sparse.floopy) - artifacts: - expire_in: 4 weeks - paths: - - build-helpers/loopy-centos6 - tags: - - docker - only: - - master - retry: 2 - Documentation: script: - EXTRA_INSTALL="pybind11 numpy" @@ -135,13 +112,11 @@ Documentation: - ". ./build-docs.sh" tags: - python3 - only: - - master Flake8: script: - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - - ". ./prepare-and-run-flake8.sh loopy test" + - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples tags: - python3 except: diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index a1fe086b4ac4562aaa8fafd32657aebbd1068e8a..0688c79603a66aabd0e021855e543f751cd76542 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -1,12 +1,12 @@ name: test-conda-env channels: - conda-forge -- defaults +- nodefaults dependencies: - python=3 - git -- conda-forge::numpy +- numpy - pocl - mako - pyopencl @@ -16,13 +16,3 @@ dependencies: - matplotlib - ipykernel - ply - -- pip - -- pip: - - git+https://github.com/inducer/pytools.git - - git+https://github.com/inducer/cgen.git - - git+https://github.com/inducer/pymbolic.git - - git+https://github.com/inducer/genpy.git - - git+https://github.com/inducer/codepy.git - - git+https://github.com/inducer/f2py diff --git a/MANIFEST.in b/MANIFEST.in index 119fb6a1dda0b5b9efd95c5908da4d3563e6a543..293d43ffc8130de870932cc17db18ebe35fd0058 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ include test/*.py include test/*.f90 -recursive-include examples *.py *.cl *.floopy *.sh *.ipynb *.cpp -recursive-include contrib *.vim +recursive-include examples *.py *.cl *.floopy *.sh *.ipynb *.cpp *.loopy +recursive-include contrib *.vim *.py include build-helpers/*.sh include build-helpers/*.spec @@ -18,4 +18,5 @@ include doc/images/*.png include configure.py include Makefile.in include README.rst +include LICENSE include requirements*.txt diff --git a/README.rst b/README.rst index 4aa93e0888a0063c6c0af2a2c8916b85018b182e..46204c29e166f86e170441ee1b54fc51def5f844 100644 --- a/README.rst +++ b/README.rst @@ -4,12 +4,12 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code .. image:: https://gitlab.tiker.net/inducer/loopy/badges/master/pipeline.svg :alt: Gitlab Build Status :target: https://gitlab.tiker.net/inducer/loopy/commits/master -.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=master +.. 
image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=master&event=push :alt: Github Build Status - :target: https://github.com/inducer/loopy/actions?query=branch%3Amaster+workflow%3ACI -.. image:: https://badge.fury.io/py/loo.py.png + :target: https://github.com/inducer/loopy/actions?query=branch%3Amaster+workflow%3ACI+event%3Apush +.. image:: https://badge.fury.io/py/loopy.png :alt: Python Package Index Release Page - :target: https://pypi.org/project/loo.py/ + :target: https://pypi.org/project/loopy/ Loopy lets you easily generate the tedious, complicated code that is necessary to get good performance out of GPUs and multi-core CPUs. @@ -38,23 +38,21 @@ Loopy targets array-type computations, such as the following: It is not (and does not want to be) a general-purpose programming language. Loopy is licensed under the liberal `MIT license -`_ and free for commercial, academic, +`_ and free for commercial, academic, and private use. All of Loopy's dependencies can be automatically installed from the package index after using:: - pip install loo.py + pip install loopy In addition, Loopy is compatible with and enhances -`pyopencl `_. +`pyopencl `_. --- Places on the web related to Loopy: -* `Python package index `_ (download releases) Note the extra '.' in the PyPI identifier! - -* `Documentation `_ (read how things work) -* `Github `_ (get latest source code, file bugs) -* `Wiki `_ (read installation tips, get examples, read FAQ) -* `Homepage `_ +* `Python package index `_ (download releases) +* `Documentation `_ (read how things work) +* `Github `_ (get latest source code, file bugs) +* `Homepage `_ diff --git a/build-helpers/.gitignore b/build-helpers/.gitignore deleted file mode 100644 index fef83014eecb14936006b90afc65595dd7d30b77..0000000000000000000000000000000000000000 --- a/build-helpers/.gitignore +++ /dev/null @@ -1 +0,0 @@ -loopy-*-20[0-9][0-9]* diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec deleted file mode 100644 index 08c0b6efe0efd3ad419b6565fd396c2f805eeab7..0000000000000000000000000000000000000000 --- a/build-helpers/loopy.spec +++ /dev/null @@ -1,70 +0,0 @@ -# -*- mode: python -*- - -from os.path import basename, dirname, join -from glob import glob - -single_file = True - -# This makes the executable spew debug info. 
-debug = False - -from os.path import expanduser - -import packaging # pip install packaging to add - -a = Analysis(['../bin/loopy'], - pathex=[expanduser('~/src/loopy')], - hiddenimports=[ - "decorator", - "appdirs", - "packaging.markers", - "packaging.specifiers", - "packaging.version", - "packaging.requirements", - ], - hookspath=None, - runtime_hooks=None, - excludes=["hedge", "meshpy", "pyopencl", "PIL"] - ) - -import ply.lex -import ply.yacc - - -a.datas += [ - (join("py-src", "ply", "lex", basename(fn)), fn, "DATA") - for fn in glob(join(dirname(ply.lex.__file__), "*.py")) - ] + [ - (join("py-src", "ply", "yacc", basename(fn)), fn, "DATA") - for fn in glob(join(dirname(ply.yacc.__file__), "*.py")) - ] - -pyz = PYZ(a.pure) - -if single_file: - exe = EXE(pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='loopy', - debug=debug, - strip=None, - upx=True, - console=True) -else: - exe = EXE(pyz, - a.scripts, - exclude_binaries=True, - name='loopy', - debug=debug, - strip=None, - upx=True, - console=True) - coll = COLLECT(exe, - a.binaries, - a.zipfiles, - a.datas, - strip=None, - upx=True, - name='loopy') diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh deleted file mode 100755 index 035634b16072e0188270abd8736dab99ce31dada..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker-inner-part-2.sh +++ /dev/null @@ -1,35 +0,0 @@ -#! /bin/bash - -set -e -set -x - -VENV_VERSION="virtualenv-15.2.0" -rm -Rf "$VENV_VERSION" -curl -k https://files.pythonhosted.org/packages/b1/72/2d70c5a1de409ceb3a27ff2ec007ecdd5cc52239e7c74990e32af57affe9/$VENV_VERSION.tar.gz | tar xfz - - -$VENV_VERSION/virtualenv.py --system-site-packages --no-setuptools .env - -source .env/bin/activate - -curl -k https://bootstrap.pypa.io/ez_setup.py | python - -curl -k https://gitlab.tiker.net/inducer/pip/raw/7.0.3/contrib/get-pip.py | python - - -pip install packaging - -PYTHON_VER=$(python -c 'import sys; print(".".join(str(s) for s in sys.version_info[:2]))') -pip install git+https://github.com/pyinstaller/pyinstaller.git@413c37bec126c0bd26084813593f65128966b4b7 - -git clone --recursive git://github.com/inducer/loopy -cd loopy - -grep -v pyopencl requirements.txt > myreq.txt - -# needed for pyinstaller package to be usable -echo packaging >> myreq.txt - -pip install -r myreq.txt -python setup.py install - -chown -R user /tmp/build - -su user -p -c "cd /tmp/build && source .env/bin/activate && cd loopy && ./build-helpers/run-pyinstaller.sh" diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh deleted file mode 100755 index a7f621b1ef21676898d2283d93f8a54f086e5d9d..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker-inner.sh +++ /dev/null @@ -1,15 +0,0 @@ -#! /bin/bash - -set -e -set -x - -mkdir /tmp/build -cd /tmp/build - -useradd -d /home/user -m -s /bin/bash user - -yum install -y centos-release-scl -yum install -y git python27 python27-python-devel python27-numpy tar gcc gcc-c++ mercurial libffi-devel - -scl enable python27 /mnt/make-linux-build-docker-inner-part-2.sh - diff --git a/build-helpers/make-linux-build-docker.sh b/build-helpers/make-linux-build-docker.sh deleted file mode 100755 index fb0cfb587d654698800bfdc827259691bc056fb7..0000000000000000000000000000000000000000 --- a/build-helpers/make-linux-build-docker.sh +++ /dev/null @@ -1,28 +0,0 @@ -#! 
/bin/bash - -# should be run in this directory (build-helpers) - -if test "$1" = "--nodate"; then - TGT_NAME=loopy-centos6 -else - TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d") -fi - -echo "Generating $TGT_NAME..." - -set -e -set -x - -docker pull centos:6 - -CNT=$(docker create -t -v $(pwd):/mnt centos:6 /mnt/make-linux-build-docker-inner.sh) -echo "working in container $CNT" - -docker start -i $CNT - -docker cp $CNT:/tmp/build/loopy/dist/loopy $(pwd) || true - -mv loopy $TGT_NAME - -docker rm $CNT - diff --git a/build-helpers/run-pyinstaller.sh b/build-helpers/run-pyinstaller.sh deleted file mode 100755 index 50f9d85dccc503be2a2ccfb6c0e3d6aa28216981..0000000000000000000000000000000000000000 --- a/build-helpers/run-pyinstaller.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! /bin/bash - -# run this from the loopy root directory - -rm -Rf dist build - -pyinstaller \ - --workpath=build/pyinstaller \ - build-helpers/loopy.spec diff --git a/build-helpers/upload.sh b/build-helpers/upload.sh deleted file mode 100755 index 57b8a873b9395954d76a8fd16f8ca9a261e8baa3..0000000000000000000000000000000000000000 --- a/build-helpers/upload.sh +++ /dev/null @@ -1,5 +0,0 @@ -#! /bin/bash - -set -e - -scp "$1" tiker.net:public_html/pub/loopy-binaries/ diff --git a/build-py-project-and-run-examples.sh b/build-py-project-and-run-examples.sh index e51a86d2085364ca142f5bfde3380a9fade0de01..a3ddf75875a657bdd7134d0580f6bdabfd2af25d 100644 --- a/build-py-project-and-run-examples.sh +++ b/build-py-project-and-run-examples.sh @@ -2,9 +2,6 @@ set -e -curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project.sh -source build-py-project.sh - function run_examples() { PATTERN=$1 @@ -25,13 +22,10 @@ function run_py_examples() } function run_ipynb_examples() { - run_examples "*.ipynb" "${PY_EXE} -m nbconvert --execute" + run_examples "*.ipynb" "${PY_EXE} -m nbconvert --to html --execute" } function run_floopy_examples() { run_examples "*.floopy" "${PY_EXE} -m loopy" } -run_py_examples -run_ipynb_examples -run_floopy_examples diff --git a/contrib/c-integer-semantics.py b/contrib/c-integer-semantics.py index 5e05ec6884c3c6b5b6c58d0080c6c0a52b91e2e4..23c7cb319177b762e83583e7bb5ea3eecd1d46da 100644 --- a/contrib/c-integer-semantics.py +++ b/contrib/c-integer-semantics.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 from os import system import ctypes diff --git a/doc/conf.py b/doc/conf.py index a2807b076f562abf8b9250f64e4ea7c16073a7b8..942afcd3ce11056c65c6a7500bb5ed312dc40187 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # loopy documentation build configuration file, created by # sphinx-quickstart on Tue Aug 9 13:40:49 2011. @@ -46,8 +45,8 @@ source_suffix = '.rst' master_doc = 'index' # General information about the project. -project = u'loopy' -copyright = u'2016, Andreas Klöckner' +project = 'loopy' +copyright = '2016, Andreas Klöckner' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -108,7 +107,7 @@ html_theme = "alabaster" html_theme_options = { "extra_nav_links": { "🚀 Github": "https://github.com/inducer/loopy", - "💾 Download Releases": "https://pypi.python.org/pypi/loo.py", + "💾 Download Releases": "https://pypi.org/project/loopy", } } @@ -148,7 +147,7 @@ html_sidebars = { # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. @@ -206,8 +205,8 @@ htmlhelp_basename = 'loopydoc' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'loopy.tex', u'loopy Documentation', - u'Andreas Kloeckner', 'manual'), + ('index', 'loopy.tex', 'loopy Documentation', + 'Andreas Kloeckner', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -239,8 +238,8 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'loopy', u'loopy Documentation', - [u'Andreas Kloeckner'], 1) + ('index', 'loopy', 'loopy Documentation', + ['Andreas Kloeckner'], 1) ] @@ -251,6 +250,8 @@ intersphinx_mapping = { 'https://documen.tician.de/pyopencl': None, 'https://documen.tician.de/cgen': None, 'https://docs.scipy.org/doc/numpy/': None, + 'https://documen.tician.de/pymbolic': None, + 'https://documen.tician.de/pytools': None, } autoclass_content = "class" diff --git a/doc/index.rst b/doc/index.rst index 9a10116d916468fd46b9b23ad113f3d9085ae699..1c64134a34086b59f9b0dd1a7010e49f037b751f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -20,43 +20,18 @@ When you run this script, the following kernel is generated, compiled, and execu .. _static-binary: -Want to try out loopy? ----------------------- - -There's no need to go through :ref:`installation` if you'd just like to get a -feel for what loopy is. Instead, you may -`download a self-contained Linux binary `_. -This is purposefully built on an ancient Linux distribution, so it should work -on most versions of Linux that are currently out there. - -Once you have the binary, do the following:: - - chmod +x ./loopy-centos6 - ./loopy-centos6 --target=opencl hello-loopy.loopy - ./loopy-centos6 --target=cuda hello-loopy.loopy - ./loopy-centos6 --target=ispc hello-loopy.loopy - -Grab the example here: :download:`examples/python/hello-loopy.loopy <../examples/python/hello-loopy.loopy>`. - -You may also donwload the most recent version by going to the `list of builds -`_, clicking on the newest one -of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then -navigating to "build-helpers", and downloading the binary from there. - Places on the web related to Loopy ---------------------------------- -* `Python package index `_ (download releases) Note the extra '.' in the PyPI identifier! - -* `Github `_ (get latest source code, file bugs) -* `Wiki `_ (read installation tips, get examples, read FAQ) -* `Homepage `_ +* `Python package index `_ (download releases) +* `Github `_ (get latest source code, file bugs) +* `Homepage `_ Table of Contents ----------------- If you're only just learning about loopy, consider the following `paper -`_ on loo.py that may serve as a good +`_ on loopy that may serve as a good introduction. Please check :ref:`installation` to get started. @@ -71,6 +46,7 @@ Please check :ref:`installation` to get started. 
ref_call ref_other misc + ref_internals Indices and tables ================== diff --git a/doc/misc.rst b/doc/misc.rst index 62e5a1fa20f2709c4933e21f43175fc1f870c348..4c8c9867f3ceee2447f9249097c7c30f4d6f501d 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -17,9 +17,7 @@ Option 1: From Source, no PyOpenCL integration This command should install :mod:`loopy`:: - pip install loo.py - -(Note the extra "."!) + pip install loopy You may need to run this with :command:`sudo`. If you don't already have `pip `_, @@ -29,14 +27,13 @@ run this beforehand:: python get-pip.py For a more manual installation, `download the source -`_, unpack it, and say:: +`_, unpack it, and say:: python setup.py install You may also clone its git repository:: - git clone --recursive git://github.com/inducer/loopy - git clone --recursive http://git.tiker.net/trees/loopy.git + git clone --recursive https://github.com/inducer/loopy.git Option 2: From Conda Forge, with PyOpenCL integration ----------------------------------------------------- @@ -256,7 +253,7 @@ This list is always growing, but here are a few pointers: * Precompute subexpressions: Use a :ref:`substitution-rule` to assign a name to a subexpression, - using may be :func:`loopy.assignment_to_subst` or :func:`extract_subst`. + using may be :func:`loopy.assignment_to_subst` or :func:`loopy.extract_subst`. Then use :func:`loopy.precompute` to create an (array or scalar) temporary with precomputed values. @@ -295,12 +292,12 @@ This list is always growing, but here are a few pointers: Use :func:`loopy.tag_inames` with the ``"vec"`` iname tag. Note that the corresponding axis of an array must also be tagged using the ``"vec"`` array axis tag - (using :func:`tag_array_axes`) in order for vector code to be + (using :func:`loopy.tag_array_axes`) in order for vector code to be generated. Vectorized loops (and array axes) must have a fixed size. (See either - :func:`split_iname` or :func:`fix_parameters` along with - :func:`split_array_axis`.) + :func:`loopy.split_iname` or :func:`loopy.fix_parameters` along with + :func:`loopy.split_array_axis`.) * Reuse of Temporary Storage @@ -309,7 +306,7 @@ This list is always growing, but here are a few pointers: * SoA $\leftrightarrow$ AoS - Use :func:`tag_array_axes` with the ``"sep"`` array axis tag + Use :func:`loopy.tag_array_axes` with the ``"sep"`` array axis tag to generate separate arrays for each entry of a short, fixed-length array axis. @@ -320,7 +317,7 @@ This list is always growing, but here are a few pointers: Use :func:`loopy.tag_inames` with the ``"ilp"`` tag. ILP loops must have a fixed size. (See either - :func:`split_iname` or :func:`fix_parameters`.) + :func:`loopy.split_iname` or :func:`loopy.fix_parameters`.) * Type inference @@ -445,7 +442,7 @@ If you use loopy for your work and find its approach helpful, please consider citing the following article. A. Klöckner. `Loo.py: transformation-based code generation for GPUs and - CPUs `_. Proceedings of ARRAY '14: ACM + CPUs `_. Proceedings of ARRAY '14: ACM SIGPLAN Workshop on Libraries, Languages, and Compilers for Array Programming. Edinburgh, Scotland. @@ -478,3 +475,16 @@ Andreas Klöckner's work on :mod:`loopy` was supported in part by AK also gratefully acknowledges a hardware gift from Nvidia Corporation. The views and opinions expressed herein do not necessarily reflect those of the funding agencies. + +Cross-References to Other Documentation +======================================= + +.. currentmodule:: numpy + +.. 
class:: int16 + + See :class:`numpy.generic`. + +.. class:: complex128 + + See :class:`numpy.generic`. diff --git a/doc/ref_creation.rst b/doc/ref_creation.rst index 6b715033cce60fa3a369f2abc4edbecbf4c9a0d3..05e0edb88245086cabea806e5aa108fa6688a9a8 100644 --- a/doc/ref_creation.rst +++ b/doc/ref_creation.rst @@ -1,6 +1,4 @@ -.. module:: loopy -.. moduleauthor:: Andreas Kloeckner - +.. currentmodule:: loopy .. _creating-kernels: Reference: Creating Kernels diff --git a/doc/ref_internals.rst b/doc/ref_internals.rst new file mode 100644 index 0000000000000000000000000000000000000000..3dc0a2bd7306e4b7e68d44e5956fe69e32c9c97f --- /dev/null +++ b/doc/ref_internals.rst @@ -0,0 +1,55 @@ +Reference: Documentation for Internal API +========================================= + +Targets +------- + +See also :ref:`targets`. + +.. automodule:: loopy.target.c + +Symbolic +-------- + +See also :ref:`expression-syntax`. + +.. automodule:: loopy.symbolic + +Types +----- + +DTypes of variables in a :class:`loopy.LoopKernel` must be picklable, so in +the codegen pipeline user-provided types are converted to +:class:`loopy.types.LoopyType`. + +.. automodule:: loopy.types + +Codegen +------- + +.. automodule:: loopy.codegen + +Reduction Operation +------------------- + +.. automodule:: loopy.library.reduction + +Iname Tags +---------- + +.. automodule:: loopy.kernel.data + +Array +----- + +.. automodule:: loopy.kernel.array + +Checks +------ + +.. automodule:: loopy.check + +Schedule +-------- + +.. automodule:: loopy.schedule diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 409cbef576d654be973dd6d1424ac40d3ea60982..d339e1b19caae740401c5b98ffbf8927d2477551 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -151,6 +151,42 @@ Tag Meaning .. }}} +Identifiers +----------- + +Reserved Identifiers +^^^^^^^^^^^^^^^^^^^^ + +The identifier prefix ``_lp_`` is reserved for internal usage; when creating +*inames*, *argument names*, *temporary variable names*, *substitution rule +names*, *instruction IDs*, and other identifiers, users should *not* use names +beginning with ``_lp_``. This prefix is used for identifiers created +internally when operating on Loopy's kernel IR. For Loopy developers, further +information on name prefixes used within submodules is below. + +Identifier Registry +^^^^^^^^^^^^^^^^^^^ + +Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for +all internally-created identifiers. Additionally, each name beginning with +``_lp_`` must start with one of the reserved prefixes below. New prefixes may +be registered by adding them to the table below. New prefixes may not themselves +be the prefix of an existing prefix. + +**Reserved Identifier Prefixes** + +======================= ================================== +Reserved Prefix Usage (module or purpose) +======================= ================================== +``_lp_linchk_`` ``loopy.linearization.checker`` +======================= ================================== + +.. note:: + + Existing Loopy code may not yet fully satisfy these naming requirements. + Name changes are in progress, and prefixes will be added to this registry + as they are created. + .. _instructions: Instructions @@ -358,6 +394,7 @@ TODO: Reductions Function Call Instructions ^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: loopy .. autoclass:: CallInstruction C Block Instructions @@ -374,6 +411,8 @@ Atomic Operations .. autoclass:: VarAtomicity +.. autoclass:: OrderedAtomic + .. autoclass:: AtomicInit .. 
autoclass:: AtomicUpdate @@ -431,7 +470,7 @@ Temporary Variables Temporary variables model OpenCL's ``private`` and ``local`` address spaces. Both have the lifetime of a kernel invocation. -.. autoclass:: temp_var_scope +.. autoclass:: AddressSpace .. autoclass:: TemporaryVariable :members: @@ -597,8 +636,8 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to :members: :undoc-members: -Implementation Detail: The Base Array -------------------------------------- +Implementation Details: The Base Array +-------------------------------------- All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and :class:`TemporaryVariable`) derive from single, shared base array type, @@ -608,4 +647,5 @@ described next. .. autoclass:: ArrayBase + .. vim: tw=75:spell:fdm=marker diff --git a/doc/ref_other.rst b/doc/ref_other.rst index 71d6c54b11dcd15977bdb375cea2207d881b5696..64367c752ba4bfa24cb5957a950f67db701966de 100644 --- a/doc/ref_other.rst +++ b/doc/ref_other.rst @@ -16,7 +16,7 @@ Controlling caching Running Kernels --------------- -In addition to simply calling kernels using :class:`LoopKernel.__call__`, +In addition to simply calling kernels using :meth:`LoopKernel.__call__`, the following underlying functionality may be used: .. autoclass:: CompiledKernel diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 565f81db46c4ffbee805bbb1f4e34419d2d6b049..1e1489d372e51dc7bd3bcc3ee43c5f7620df4ea9 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -179,11 +179,11 @@ for good measure. >>> assert (out.get() == (2*x_vec_dev).get()).all() We can have loopy print the OpenCL kernel it generated -by passing :attr:`loopy.Options.write_cl`. +by passing :attr:`loopy.Options.write_code`. .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -227,7 +227,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`: .. doctest:: - >>> knl = lp.set_options(knl, write_wrapper=True, write_cl=False) + >>> knl = lp.set_options(knl, write_wrapper=True, write_code=False) >>> evt, (out,) = knl(queue, a=x_vec_host) from __future__ import division ... @@ -246,18 +246,26 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`: # }}} ... +You can also pass options to the OpenCL implementation +by passing :attr:`loopy.Options.build_options`. + +.. doctest:: + + >>> knl = lp.set_options(knl, build_options=["-cl-mad-enable"]) + + Generating code ~~~~~~~~~~~~~~~ Instead of using loopy to run the code it generates, you can also just use loopy as a code generator and take care of executing the generated kernels yourself. In this case, make sure loopy knows about all types, and then -call :func:`loopy.generate_code`: +call :func:`loopy.generate_code_v2`: .. doctest:: >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32)) - >>> code, _ = lp.generate_code(typed_knl) + >>> code = lp.generate_code_v2(typed_knl).device_code() >>> print(code) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -355,7 +363,7 @@ loopy can also show an instruction dependency graph, using Dependencies are shown as arrows from prerequisite to dependent in the graph. This functionality requires the open-source `graphviz -`_ graph drawing tools to be installed. The generated +`_ graph drawing tools to be installed. The generated graph will open in a browser window. 
Since manually notating lots of dependencies is cumbersome, loopy has @@ -380,7 +388,7 @@ Let us take a look at the generated code for the above kernel: .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> knl = lp.prioritize_loops(knl, "i,j") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) @@ -430,7 +438,7 @@ Now the intended code is generated and our test passes. .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -485,7 +493,7 @@ ambiguous. .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -523,7 +531,7 @@ is overwritten with the new kernel:: knl = lp.do_something(knl, arguments...) We've already seen an example of a transformation above: -For instance, :func:`prioritize_loops` fit the pattern. +For instance, :func:`loopy.prioritize_loops` fit the pattern. :func:`loopy.split_iname` is another fundamental (and useful) transformation. It turns one existing iname (recall that this is loopy's word for a 'loop @@ -543,7 +551,7 @@ Consider this example: ... "a[i] = 0", assumptions="n>=1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.prioritize_loops(knl, "i_outer,i_inner") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -574,7 +582,7 @@ relation to loop nesting. For example, it's perfectly possible to request ... "a[i] = 0", assumptions="n>=1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.prioritize_loops(knl, "i_inner,i_outer") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -599,7 +607,7 @@ commonly called 'loop tiling': >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.split_iname(knl, "j", 16) >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -641,7 +649,7 @@ loop's tag to ``"unr"``: >>> knl = lp.split_iname(knl, "i", 4) >>> knl = lp.tag_inames(knl, dict(i_inner="unr")) >>> knl = lp.prioritize_loops(knl, "i_outer,i_inner") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -716,7 +724,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size ... "a[i] = 0", assumptions="n>=0") >>> knl = lp.split_iname(knl, "i", 128, ... outer_tag="g.0", inner_tag="l.0") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... 
@@ -762,7 +770,7 @@ assumption: >>> knl = lp.split_iname(knl, "i", 4) >>> knl = lp.tag_inames(knl, dict(i_inner="unr")) >>> knl = lp.prioritize_loops(knl, "i_outer,i_inner") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -781,7 +789,7 @@ assumption: While these conditionals enable the generated code to deal with arbitrary *n*, they come at a performance cost. Loopy allows generating separate code for the last iteration of the *i_outer* loop, by using the *slabs* keyword -argument to :func:`split_iname`. Since this last iteration of *i_outer* is +argument to :func:`loopy.split_iname`. Since this last iteration of *i_outer* is the only iteration for which ``i_inner + 4*i_outer`` can become larger than *n*, only the (now separate) code for that iteration contains conditionals, enabling some cost savings: @@ -790,7 +798,7 @@ enabling some cost savings: >>> knl = orig_knl >>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> knl = lp.prioritize_loops(knl, "i_outer,i_inner") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) @@ -886,7 +894,7 @@ memory, local to each work item. .. doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out1, out2) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -947,7 +955,7 @@ Consider the following example: ... """) >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) >>> knl = lp.set_temporary_scope(knl, "a_temp", "local") - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... @@ -1012,7 +1020,7 @@ transformation exists in :func:`loopy.add_prefetch`: ... out[16*i_outer + i_inner] = sum(k, a[16*i_outer + i_inner]) ... """) >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> knl_pf = lp.add_prefetch(knl, "a") >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) @@ -1110,7 +1118,7 @@ work item: * *Local barriers* ensure consistency of memory accesses to items within *the same* work group. This synchronizes with all instructions in the work group. The type of memory (local or global) may be specified by the - :attr:`loopy.instruction.BarrierInstruction.mem_kind` + :attr:`loopy.BarrierInstruction.mem_kind` * *Global barriers* ensure consistency of memory accesses across *all* work groups, i.e. it synchronizes with every work item @@ -1360,7 +1368,7 @@ a loopy kernel by simply calling them, e.g.:: Additionally, all functions of one variable are currently recognized during code-generation however additional implementation may be required for custom functions. The full lists of available functions may be found in a the -:class:`TargetBase` implementation (e.g. :class:`CudaTarget`) +:class:`loopy.TargetBase` implementation (e.g. :class:`loopy.CudaTarget`) Custom user functions may be represented using the method described in :ref:`functions` @@ -1470,7 +1478,7 @@ When we ask to see the code, the issue becomes apparent: .. 
doctest:: - >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_options(knl, "write_code") >>> from warnings import catch_warnings >>> with catch_warnings(): ... filterwarnings("always", category=lp.LoopyWarning) @@ -1568,13 +1576,13 @@ number of operations matching the characteristics of the :class:`loopy.Op` specified in the key (in terms of the :class:`loopy.LoopKernel` *inames*). :class:`loopy.Op` attributes include: -- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the +- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the data type operated on. - name: A :class:`str` that specifies the kind of arithmetic operation as *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. -One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: +One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_with_dict`: .. doctest:: @@ -1659,7 +1667,7 @@ Each line of output will look roughly like:: - mtype: A :class:`str` that specifies the memory type accessed as **global** or **local** -- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the +- dtype: A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. - lid_strides: A :class:`dict` of **{** :class:`int` **:** @@ -1681,7 +1689,7 @@ Each line of output will look roughly like:: - variable: A :class:`str` that specifies the variable name of the data accessed. -We can evaluate these polynomials using :func:`islpy.eval_with_dict`: +We can evaluate these polynomials using :meth:`islpy.PwQPolynomial.eval_with_dict`: .. doctest:: @@ -1850,7 +1858,7 @@ kernel from the previous example: Sync(kernel_launch, loopy_kernel) : [l, m, n] -> { 1 } -We can evaluate this polynomial using :func:`islpy.eval_with_dict`: +We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`: .. doctest:: @@ -1915,7 +1923,7 @@ Based on the kernel code printed above, we would expect each work-item to encounter 50x10x2 barriers, which matches the result from :func:`loopy.get_synchronization_map`. In this case, the number of barriers does not depend on any inames, so we can pass an empty dictionary to -:func:`islpy.eval_with_dict`. +:meth:`islpy.PwQPolynomial.eval_with_dict`. .. 
}}} diff --git a/examples/fortran/matmul-driver.py b/examples/fortran/matmul-driver.py new file mode 100644 index 0000000000000000000000000000000000000000..111ac241198581a75ad42d91f9db8e4e89a3cbf2 --- /dev/null +++ b/examples/fortran/matmul-driver.py @@ -0,0 +1,35 @@ +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.array +import pyopencl.clrandom +import loopy as lp + + +def main(): + fn = "matmul.floopy" + with open(fn) as inf: + source = inf.read() + + dgemm, = lp.parse_transformed_fortran(source, filename=fn) + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + n = 2048 + a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F") + b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F") + c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F") + cl.clrandom.fill_rand(a) + cl.clrandom.fill_rand(b) + + dgemm = lp.set_options(dgemm, write_code=True) + + dgemm(queue, a=a, b=b, alpha=1, c=c) + + c_ref = (a.get() @ b.get()) + assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10 + + +if __name__ == "__main__": + main() diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index a8377beddb912a2d6b1d9255694336313089a0f9..733cdaac4d9153803dcb54d5c114a33871403bbf 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -22,7 +22,11 @@ end subroutine ! ! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2") ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") -! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") -! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", +! precompute_outer_inames="i_outer, j_outer, k_outer", +! default_tag="l.auto") +! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", +! precompute_outer_inames="i_outer, j_outer, k_outer", +! default_tag="l.auto") ! 
RESULT = dgemm !$loopy end diff --git a/examples/python/call-external.py b/examples/python/call-external.py index c13d99bd06295096c26d6e113841c853f80645fc..104d12f38a96b6a70aa2313c6ab3a8884e67c696 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -68,8 +68,8 @@ class BLASCallable(lp.ScalarCallable): par_dtype).expr for par, par_dtype in zip( parameters, par_dtypes)] - c_parameters.insert(0, var('CblasRowMajor')) - c_parameters.insert(1, var('CblasNoTrans')) + c_parameters.insert(0, var("CblasRowMajor")) + c_parameters.insert(1, var("CblasNoTrans")) c_parameters.insert(2, mat_descr.shape[0]) c_parameters.insert(3, mat_descr.shape[1]) c_parameters.insert(4, 1) @@ -85,8 +85,8 @@ class BLASCallable(lp.ScalarCallable): def blas_fn_lookup(target, identifier): - if identifier == 'gemv': - return BLASCallable(name='gemv') + if identifier == "gemv": + return BLASCallable(name="gemv") return None # }}} @@ -99,9 +99,9 @@ knl = lp.make_kernel( """ y[:] = gemv(A[:, :], x[:]) """, [ - lp.GlobalArg('A', dtype=np.float64, shape=(n, n)), - lp.GlobalArg('x', dtype=np.float64, shape=(n, )), - lp.GlobalArg('y', shape=(n, )), ...], + lp.GlobalArg("A", dtype=np.float64, shape=(n, n)), + lp.GlobalArg("x", dtype=np.float64, shape=(n, )), + lp.GlobalArg("y", shape=(n, )), ...], target=CTarget(), lang_version=(2018, 2)) diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 764cea0e63036ff1a1338cce1210c9e198e954a7..ad0028d19a466474eed5e49cf9526424de4a60a7 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -2,7 +2,7 @@ import numpy as np import loopy as lp import pyopencl as cl import pyopencl.array -from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 # setup # ----- diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index 90f31f0946d06edf5565e744b9080c59c66818ca..ce40487b1f41a6a591134a21eeb14113fd8be4fa 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -54,7 +54,7 @@ else: def main(): - with open("tasksys.cpp", "r") as ts_file: + with open("tasksys.cpp") as ts_file: tasksys_source = ts_file.read() def make_knl(name, insn, vars): diff --git a/examples/python/rank-one.py b/examples/python/rank-one.py index b8da89c6c75986e3baf5e35ee76b680d08c51632..aa2a650feb165684a9d65207772e093568b9f98e 100644 --- a/examples/python/rank-one.py +++ b/examples/python/rank-one.py @@ -33,8 +33,10 @@ evt, (c,) = knl(queue, a=a, b=b) split_knl = knl # PREFETCH1BEGIN -knl = lp.add_prefetch(knl, "a") -knl = lp.add_prefetch(knl, "b") +knl = lp.add_prefetch(knl, "a", + fetch_outer_inames="i_outer, i_inner, j_outer, j_inner") +knl = lp.add_prefetch(knl, "b", + fetch_outer_inames="i_outer, i_inner, j_outer, j_inner") # PREFETCH1END knl = lp.set_options(knl, write_code=True) @@ -43,8 +45,14 @@ evt, (c,) = knl(queue, a=a, b=b) knl = split_knl # PREFETCH2BEGIN -knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.0") -knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.0") +knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames="i_outer, j_outer, j_inner", + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.0") +knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames="i_outer, j_outer, j_inner", + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.0") # PREFETCH2END knl = lp.set_options(knl, write_code=True) @@ -58,8 
+66,10 @@ knl = lp.split_iname(knl, "i", 256, knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) -knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) -knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) +knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) +knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") diff --git a/loopy/__init__.py b/loopy/__init__.py index 819eccbd3a0feb303a528098b70cb8d3d411f079..36eabd0a38ba60d995f55218ab67acbb4162609f 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,9 +21,6 @@ THE SOFTWARE. """ -import six -from six.moves import range, zip - from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning @@ -36,7 +31,7 @@ from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel.instruction import ( MemoryOrdering, memory_ordering, MemoryScope, memory_scope, - VarAtomicity, AtomicInit, AtomicUpdate, + VarAtomicity, OrderedAtomic, AtomicInit, AtomicUpdate, InstructionBase, MultiAssignmentBase, Assignment, ExpressionInstruction, CallInstruction, CInstruction, NoOpInstruction, BarrierInstruction) @@ -79,7 +74,7 @@ from loopy.transform.iname import ( affine_map_inames, find_unused_axis_tag, make_reduction_inames_unique, has_schedulable_iname_nesting, get_iname_duplication_options, - add_inames_to_insn) + add_inames_to_insn, add_inames_for_unused_hw_axes) from loopy.transform.instruction import ( find_instructions, map_instructions, @@ -171,7 +166,7 @@ __all__ = [ "MemoryScope", "memory_scope", # lower case is deprecated "VarAtomicity", - "AtomicInit", "AtomicUpdate", + "OrderedAtomic", "AtomicInit", "AtomicUpdate", "InstructionBase", "MultiAssignmentBase", "Assignment", "ExpressionInstruction", "CallInstruction", "CInstruction", "NoOpInstruction", @@ -204,7 +199,7 @@ __all__ = [ "affine_map_inames", "find_unused_axis_tag", "make_reduction_inames_unique", "has_schedulable_iname_nesting", "get_iname_duplication_options", - "add_inames_to_insn", + "add_inames_to_insn", "add_inames_for_unused_hw_axes", "add_prefetch", "change_arg_to_image", "tag_array_axes", "tag_data_axes", @@ -334,7 +329,7 @@ def set_options(kernel, *args, **kwargs): from loopy.options import _apply_legacy_map, Options kwargs = _apply_legacy_map(Options._legacy_options_map, kwargs) - for key, val in six.iteritems(kwargs): + for key, val in kwargs.items(): if not hasattr(new_opt, key): raise ValueError("unknown option '%s'" % key) @@ -440,7 +435,7 @@ def set_caching_enabled(flag): CACHING_ENABLED = flag -class CacheMode(object): +class CacheMode: """A context manager for setting whether :mod:`loopy` is allowed to use disk caches. 
""" @@ -487,10 +482,10 @@ def make_copy_kernel(new_dim_tags, old_dim_tags=None): shape = ["n%d" % i for i in range(rank)] commad_indices = ", ".join(indices) bounds = " and ".join( - "0<=%s<%s" % (ind, shape_i) + f"0<={ind}<{shape_i}" for ind, shape_i in zip(indices, shape)) - set_str = "{[%s]: %s}" % ( + set_str = "{{[{}]: {}}}".format( commad_indices, bounds ) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index ebc07e1fce26f8d0f405ca5e699e32480e21fa4d..73b11b70bbfc8110f7bfed272c88d79d267a218a 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from six.moves import range, zip import os from warnings import warn @@ -452,7 +449,7 @@ def auto_test_vs_ref( ref_implemented_data_info = ref_codegen_result.implemented_data_info - logger.info("%s (ref): trying %s for the reference calculation" % ( + logger.info("{} (ref): trying {} for the reference calculation".format( ref_prog.name, dev)) if not quiet and print_ref_code: @@ -490,7 +487,7 @@ def auto_test_vs_ref( ref_queue.finish() - logger.info("%s (ref): using %s for the reference calculation" % ( + logger.info("{} (ref): using {} for the reference calculation".format( ref_prog.name, dev)) logger.info("%s (ref): run" % ref_prog.name) @@ -526,6 +523,16 @@ def auto_test_vs_ref( queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + from loopy.kernel import KernelState + from loopy.target.pyopencl import PyOpenCLTarget + if test_prog.state not in [ + KernelState.PREPROCESSED, + KernelState.LINEARIZED]: + if isinstance(test_prog.target, PyOpenCLTarget): + test_prog = test_prog.copy(target=PyOpenCLTarget(ctx.devices[0])) + + test_prog = lp.preprocess_kernel(test_prog) + from loopy.type_inference import infer_unknown_types test_prog = infer_unknown_types(test_prog, expect_completion=True) @@ -634,7 +641,7 @@ def auto_test_vs_ref( rates = "" for cnt, lbl in zip(op_count, op_label): - rates += " %g %s/s" % (cnt/elapsed_wall, lbl) + rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) if not quiet: def format_float_or_none(v): @@ -652,10 +659,28 @@ def auto_test_vs_ref( if do_check: ref_rates = "" for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) + rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) + if not quiet: - print("ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed_event, ref_elapsed_wall, ref_rates)) + def format_float_or_none(v): + if v is None: + return "" + else: + return "%g" % v + + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) + + if do_check: + ref_rates = "" + for cnt, lbl in zip(op_count, op_label): + ref_rates += " {:g} {}/s".format(cnt/ref_elapsed_event, lbl) + if not quiet: + print("ref: elapsed: {:g} s event, {:g} s wall{}".format( + ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} diff --git a/loopy/check.py b/loopy/check.py index b0cf68f08224c44f8e2186a80fd31e99b7f67ebf..44fbfe155fd778350cc6fee642b4fa856ebb6fc3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import, division, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" 
 __license__ = """
@@ -22,16 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import six
-from six.moves import range
 from islpy import dim_type
 import islpy as isl
 from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction
-from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel
+from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning,
+        warn_with_kernel, ExpressionToAffineConversionError)
 from loopy.type_inference import TypeInferenceMapper
 from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction,
         CInstruction, _DataObliviousInstruction)
+from warnings import warn
 
 from functools import reduce
 
@@ -39,6 +37,35 @@
 import logging
 logger = logging.getLogger(__name__)
 
 
+__doc__ = """
+.. currentmodule:: loopy.check
+
+.. autofunction:: check_for_integer_subscript_indices
+
+.. autofunction:: check_for_duplicate_insn_ids
+
+.. autofunction:: check_for_double_use_of_hw_axes
+
+.. autofunction:: check_insn_attributes
+
+.. autofunction:: check_loop_priority_inames_known
+
+.. autofunction:: check_multiple_tags_allowed
+
+.. autofunction:: check_for_inactive_iname_access
+
+.. autofunction:: check_for_unused_inames
+
+.. autofunction:: check_for_write_races
+
+.. autofunction:: check_for_data_dependent_parallel_bounds
+
+.. autofunction:: check_bounds
+
+.. autofunction:: check_variable_access_ordered
+"""
+
+
 # {{{ sanity checks run before preprocessing
 
 def check_identifiers_in_subst_rules(knl):
@@ -50,7 +77,7 @@ def check_identifiers_in_subst_rules(knl):
 
     allowed_identifiers = knl.all_variable_names()
 
-    for rule in six.itervalues(knl.substitutions):
+    for rule in knl.substitutions.values():
         deps = get_dependencies(rule.expression)
 
         rule_allowed_identifiers = allowed_identifiers | frozenset(rule.arguments)
@@ -84,11 +111,11 @@ class UnscopedCallCollector(CombineMapper):
     def map_call_with_kwargs(self, expr):
         if not isinstance(expr.function, ResolvedFunction):
             return (frozenset([expr.function.name]) |
-                    self.combine((self.rec(child) for child in expr.parameters
-                        + tuple(expr.kw_parameters.values()))))
+                    self.combine(self.rec(child) for child in expr.parameters
+                        + tuple(expr.kw_parameters.values())))
         else:
-            return self.combine((self.rec(child) for child in
-                expr.parameters+tuple(expr.kw_parameters.values())))
+            return self.combine(self.rec(child) for child in
+                expr.parameters+tuple(expr.kw_parameters.values()))
 
     def map_constant(self, expr):
         return frozenset()
@@ -136,7 +163,13 @@ VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"])
 class SubscriptIndicesIsIntChecker(TypeInferenceMapper):
     def map_subscript(self, expr):
         for idx in expr.index_tuple:
-            if not self.rec(idx)[0].is_integral():
+            type_inf_result = self.rec(idx)
+            if not type_inf_result:
+                raise LoopyError(
+                        "When checking that subscript indices are integral: "
+                        "Type inference did not find type of '%s'"
+                        % idx)
+            if not type_inf_result[0].is_integral():
                 raise LoopyError("Non-integral array indices obtained in"
                         " {}.".format(expr))
 
@@ -144,6 +177,12 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper):
 
 
 def check_for_integer_subscript_indices(kernel, callables_table):
+    """
+    Checks if every array access is of type :class:`int`.
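+
+    For example, a (hypothetical) access like ``a[0.5*i]`` would be rejected
+    here, since the subscript ``0.5*i`` is inferred to have a floating-point
+    type.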
+    """
     from pymbolic.primitives import Subscript
     idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table)
     for insn in kernel.instructions:
@@ -160,7 +196,10 @@


 def check_insn_attributes(kernel):
-    all_insn_ids = set(insn.id for insn in kernel.instructions)
+    """
+    Check for legality of attributes of every instruction in *kernel*.
+    """
+    all_insn_ids = {insn.id for insn in kernel.instructions}

     for insn in kernel.instructions:
         if not insn.within_inames <= kernel.all_inames():
@@ -175,14 +214,14 @@ def check_insn_attributes(kernel):
                     % (insn.id, ", ".join(
                         insn.depends_on - all_insn_ids)))

-        no_sync_with_insn_ids = set(id for id, scope in insn.no_sync_with)
+        no_sync_with_insn_ids = {id for id, scope in insn.no_sync_with}
         if not no_sync_with_insn_ids <= all_insn_ids:
             raise LoopyError("insn '%s' has nosync directive with unknown "
                     "instruction ids: %s"
                     % (insn.id,
                         ", ".join(no_sync_with_insn_ids - all_insn_ids)))

-        no_sync_with_scopes = set(scope for id, scope in insn.no_sync_with)
+        no_sync_with_scopes = {scope for id, scope in insn.no_sync_with}
         if not no_sync_with_scopes <= VALID_NOSYNC_SCOPES:
             raise LoopyError("insn '%s' has invalid nosync scopes: %s"
                     % (insn.id,
@@ -190,6 +229,10 @@ def check_insn_attributes(kernel):


 def check_for_duplicate_insn_ids(knl):
+    """
+    Check if multiple instructions of *knl* have the same
+    :attr:`loopy.InstructionBase.id`.
+    """
     insn_ids = set()

     for insn in knl.instructions:
@@ -201,6 +244,10 @@ def check_for_duplicate_insn_ids(knl):


 def check_loop_priority_inames_known(kernel):
+    """
+    Checks if the inames in :attr:`loopy.LoopKernel.loop_priority` are part of
+    the *kernel*'s domain.
+    """
     for prio in kernel.loop_priority:
         for iname in prio:
             if iname not in kernel.all_inames():
@@ -215,26 +262,33 @@ def _get_all_unique_iname_tags(kernel):
     from itertools import chain
     iname_tags = list(chain(*(kernel.iname_to_tags.get(iname, [])
                               for iname in kernel.all_inames())))
-    return set(
+    return {
             tag for tag in iname_tags if
-            isinstance(tag, UniqueTag))
+            isinstance(tag, UniqueTag)}


 def check_multiple_tags_allowed(kernel):
+    """
+    Checks that multiple tags of an iname are compatible.
+    """
     from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag,
                                 UnrollTag, ForceSequentialTag, IlpBaseTag,
                                 filter_iname_tags_by_type)
     illegal_combinations = [
         (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag),
         (IlpBaseTag, ForceSequentialTag)
     ]
-    for iname, tags in six.iteritems(kernel.iname_to_tags):
+    for iname, tags in kernel.iname_to_tags.items():
         for comb in illegal_combinations:
             if len(filter_iname_tags_by_type(tags, comb)) > 1:
-                raise LoopyError("iname {0} has illegal combination of "
-                        "tags: {1}".format(iname, tags))
+                raise LoopyError("iname {} has illegal combination of "
+                        "tags: {}".format(iname, tags))


 def check_for_double_use_of_hw_axes(kernel, callables_table):
+    """
+    Check if any instruction of *kernel* is within multiple inames tagged with
+    the same hw axis tag.
+    """
     from loopy.kernel.data import UniqueTag
     from loopy.kernel.instruction import CallInstruction
     from loopy.kernel.function_interface import CallableKernel
@@ -267,6 +321,9 @@


 def check_for_inactive_iname_access(kernel):
+    """
+    Check if any instruction accesses an iname but is not within it.
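+    (For example, an instruction that reads iname ``j`` while it is only
+    nested within the ``i`` loop.)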
+ """ for insn in kernel.instructions: expression_inames = insn.read_dependency_names() & kernel.all_inames() @@ -280,6 +337,22 @@ def check_for_inactive_iname_access(kernel): kernel.insn_inames(insn)), kernel.name)) +def check_for_unused_inames(kernel): + """ + Check if there are any unused inames in the kernel. + """ + # Warn if kernel has unused inames + from loopy.transform.iname import get_used_inames + unused_inames = kernel.all_inames() - get_used_inames(kernel) + if unused_inames: + warn_with_kernel( + kernel, "unused_inames", + "Found unused inames in kernel: %s " + "Unused inames during linearization will be prohibited in " + "Loopy version 2021.X." + % unused_inames) + + def _is_racing_iname_tag(tv, tag): from loopy.kernel.data import (AddressSpace, LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) @@ -307,6 +380,9 @@ def _is_racing_iname_tag(tv, tag): def check_for_write_races(kernel): + """ + Check if any memory accesses lead to write races. + """ from loopy.kernel.data import ConcurrentTag for insn in kernel.instructions: @@ -324,16 +400,16 @@ def check_for_write_races(kernel): # Any concurrent tags that are not depended upon by the assignee # will cause write races. - raceable_parallel_insn_inames = set( + raceable_parallel_insn_inames = { iname for iname in kernel.insn_inames(insn) - if kernel.iname_tags_of_type(iname, ConcurrentTag)) + if kernel.iname_tags_of_type(iname, ConcurrentTag)} elif assignee_name in kernel.temporary_variables: temp_var = kernel.temporary_variables[assignee_name] - raceable_parallel_insn_inames = set( + raceable_parallel_insn_inames = { iname for iname in kernel.insn_inames(insn) if any(_is_racing_iname_tag(temp_var, tag) - for tag in kernel.iname_tags(iname))) + for tag in kernel.iname_tags(iname))} else: raise LoopyError("invalid assignee name in instruction '%s'" @@ -355,7 +431,7 @@ def check_for_orphaned_user_hardware_axes(kernel): from loopy.kernel.data import LocalIndexTag for axis in kernel.local_sizes: found = False - for tags in six.itervalues(kernel.iname_to_tags): + for tags in kernel.iname_to_tags.values(): for tag in tags: if isinstance(tag, LocalIndexTag) and tag.axis == axis: found = True @@ -369,13 +445,17 @@ def check_for_orphaned_user_hardware_axes(kernel): def check_for_data_dependent_parallel_bounds(kernel): + """ + Check that inames tagged as hw axes have bounds that are known at kernel + launch. 
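+
+    (For example, a concurrently-tagged iname whose bound is a temporary
+    computed by the kernel itself cannot be mapped to a hardware grid axis.)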
+    """
     from loopy.kernel.data import ConcurrentTag

     for i, dom in enumerate(kernel.domains):
         dom_inames = set(dom.get_var_names(dim_type.set))

-        par_inames = set(
+        par_inames = {
                 iname for iname in dom_inames
-                if kernel.iname_tags_of_type(iname, ConcurrentTag))
+                if kernel.iname_tags_of_type(iname, ConcurrentTag)}

         if not par_inames:
             continue
@@ -392,13 +472,12 @@

 # {{{ check access bounds

 class _AccessCheckMapper(WalkMapper):
-    def __init__(self, kernel, domain, insn_id):
+    def __init__(self, kernel, insn_id):
         self.kernel = kernel
-        self.domain = domain
         self.insn_id = insn_id

-    def map_subscript(self, expr):
-        WalkMapper.map_subscript(self, expr)
+    def map_subscript(self, expr, domain):
+        WalkMapper.map_subscript(self, expr, domain)

         from pymbolic.primitives import Variable
         assert isinstance(expr.aggregate, Variable)
@@ -421,7 +500,7 @@ class _AccessCheckMapper(WalkMapper):
             from loopy.symbolic import (get_dependencies, get_access_range,
                     UnableToDetermineAccessRange)

-            available_vars = set(self.domain.get_var_dict())
+            available_vars = set(domain.get_var_dict())
             shape_deps = set()
             for shape_axis in shape:
                 if shape_axis is not None:
@@ -438,8 +517,7 @@ class _AccessCheckMapper(WalkMapper):
                             len(subscript), len(shape)))

             try:
-                access_range = get_access_range(self.domain, subscript,
-                        self.kernel.assumptions)
+                access_range = get_access_range(domain, subscript)
             except UnableToDetermineAccessRange:
                 # Likely: index was non-affine, nothing we can do.
                 return
@@ -462,8 +540,29 @@ class _AccessCheckMapper(WalkMapper):
                             " establish '%s' is a subset of '%s')."
                             % (expr, self.insn_id, access_range, shape_domain))

+    def map_if(self, expr, domain):
+        from loopy.symbolic import get_dependencies
+        if get_dependencies(expr.condition) <= frozenset(
+                domain.space.get_var_dict()):
+            try:
+                from loopy.symbolic import isl_set_from_expr
+                then_set = isl_set_from_expr(domain.space, expr.condition)
+                else_set = then_set.complement()
+            except ExpressionToAffineConversionError:
+                # non-affine condition: can't do much
+                then_set = else_set = isl.BasicSet.universe(domain.space)
+        else:
+            # data-dependent condition: can't do much
+            then_set = else_set = isl.BasicSet.universe(domain.space)
+
+        self.rec(expr.then, domain & then_set)
+        self.rec(expr.else_, domain & else_set)
+

 def check_bounds(kernel):
+    """
+    Performs an out-of-bounds check for every array access.
+    """
     temp_var_names = set(kernel.temporary_variables)
     for insn in kernel.instructions:
         domain = kernel.get_inames_domain(kernel.insn_inames(insn))
@@ -472,10 +571,12 @@ def check_bounds(kernel):
         if set(domain.get_var_names(dim_type.param)) & temp_var_names:
             continue

-        acm = _AccessCheckMapper(kernel, domain, insn.id)
+        acm = _AccessCheckMapper(kernel, insn.id)
+        domain, assumptions = isl.align_two(domain, kernel.assumptions)
+        domain_with_assumptions = domain & assumptions

         def run_acm(expr):
-            acm(expr)
+            acm(expr, domain_with_assumptions)
             return expr

         insn.with_transformed_expressions(run_acm)
@@ -519,7 +620,7 @@ def check_has_schedulable_iname_nesting(kernel):
     if not has_schedulable_iname_nesting_for_single_kernel(kernel):
         import itertools as it
         opt = get_iname_duplication_options_for_single_kernel(kernel)
-        opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w)
+        opt_str = "\n".join(f"* Duplicate {i} within instructions {w}"
                 for i, w in it.islice(opt, 3))
         raise LoopyError("Kernel does not have a schedulable iname nesting. 
" "In order for there to exist a feasible loop nesting, you " @@ -533,45 +634,9 @@ def check_has_schedulable_iname_nesting(kernel): # {{{ check_variable_access_ordered -class IndirectDependencyEdgeFinder(object): - def __init__(self, kernel): - self.kernel = kernel - self.dep_edge_cache = {} - - def __call__(self, depender_id, dependee_id): - cache_key = (depender_id, dependee_id) - - try: - result = self.dep_edge_cache[cache_key] - except KeyError: - pass - else: - if result is None: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("when " - "checking for dependency edge between " - "depender '%s' and dependee '%s'" - % (depender_id, dependee_id)) - else: - return result - - depender = self.kernel.id_to_insn[depender_id] - - if dependee_id in depender.depends_on: - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = None - for dep in depender.depends_on: - if self(dep, dependee_id): - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = False - return False - - def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): + dep_a = kernel.id_to_insn[dep_a] + dep_b = kernel.id_to_insn[dep_b] from loopy.kernel.data import AddressSpace if var_address_space == AddressSpace.GLOBAL: search_scopes = ["global", "any"] @@ -594,127 +659,215 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): return ab_nosync and ba_nosync +def _get_address_space(kernel, var): + from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg + if var in kernel.temporary_variables: + address_space = kernel.temporary_variables[var].address_space + else: + arg = kernel.arg_dict[var] + if isinstance(arg, ArrayArg): + address_space = arg.address_space + elif isinstance(arg, ValueArg): + address_space = AddressSpace.PRIVATE + else: + # No need to consider ConstantArg and ImageArg (for now) + # because those won't be written. + raise ValueError("could not determine address_space of '%s'" % var) + return address_space + + +def _get_topological_order(kernel): + """ + Returns a :class:`list` of insn ids of *kernel* in a topological sort + order. + + If there is a dependency cycle within the instructions of *kernel* raises a + :class:`loopy.diagnostic.DependencyCycleFound` exception. + """ + from pytools.graph import compute_sccs + from loopy.diagnostic import DependencyCycleFound + + dep_map = {insn.id: insn.depends_on for insn in kernel.instructions} + + # pytools.graph.compute_sccs serves 2 purposes: + # 1. computes topological sort order of instructions. + # 2. provides info. about any cycles in the graph. + sccs = compute_sccs(dep_map) + order = [] + + for scc in sccs: + if len(scc) != 1: + raise DependencyCycleFound(", ".join(scc)) + order.append(scc[0]) + + return order + + def _check_variable_access_ordered_inner(kernel): - logger.debug("%s: check_variable_access_ordered: start" % kernel.name) + from loopy.kernel.tools import find_aliasing_equivalence_classes + from loopy.symbolic import AccessRangeOverlapChecker + overlap_checker = AccessRangeOverlapChecker(kernel) + aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) - checked_variables = kernel.get_written_variables() & ( - set(kernel.temporary_variables) | set(arg for arg in kernel.arg_dict)) + # dep_reqs_to_vars: A mapping (writer_id, dep_req_id) -> set of variable names, + # where the tuple denotes a pair of instructions IDs, and the variable + # names are the ones that necessitate a dependency. 
+ # + # Note: This can be worst-case O(n^2) in the number of instructions. + dep_reqs_to_vars = {} wmap = kernel.writer_map() rmap = kernel.reader_map() - from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg - from loopy.kernel.tools import find_aliasing_equivalence_classes - - depfind = IndirectDependencyEdgeFinder(kernel) - aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) + # {{{ populate 'dep_reqs_to_vars' - for name in checked_variables: - # This is a tad redundant in that this could probably be restructured - # to iterate only over equivalence classes and not individual variables. - # But then the access-range overlap check below would have to be smarter. - eq_class = aliasing_equiv_classes[name] + for var in kernel.get_written_variables(): + address_space = _get_address_space(kernel, var) + eq_class = aliasing_equiv_classes[var] readers = set.union( *[rmap.get(eq_name, set()) for eq_name in eq_class]) writers = set.union( *[wmap.get(eq_name, set()) for eq_name in eq_class]) - unaliased_readers = rmap.get(name, set()) - unaliased_writers = wmap.get(name, set()) - - if not writers: - continue - if name in kernel.temporary_variables: - address_space = kernel.temporary_variables[name].address_space - else: - arg = kernel.arg_dict[name] - if isinstance(arg, ArrayArg): - address_space = arg.address_space - elif isinstance(arg, ValueArg): - address_space = AddressSpace.PRIVATE - else: - # No need to consider ConstantArg and ImageArg (for now) - # because those won't be written. - raise ValueError("could not determine address_space of '%s'" % name) - - # Check even for PRIVATE address space, to ensure intentional program order. - - from loopy.symbolic import AccessRangeOverlapChecker - overlap_checker = AccessRangeOverlapChecker(kernel) + for writer in writers: + required_deps = (readers | writers) - {writer} + required_deps = {req_dep + for req_dep in required_deps + if not declares_nosync_with(kernel, address_space, writer, + req_dep)} - for writer_id in writers: - for other_id in readers | writers: - if writer_id == other_id: - continue - - writer = kernel.id_to_insn[writer_id] - other = kernel.id_to_insn[other_id] + for req_dep in required_deps: + dep_reqs_to_vars.setdefault((writer, req_dep), set()).add(var) - has_dependency_relationship = ( - declares_nosync_with(kernel, address_space, other, writer) - or - depfind(writer_id, other_id) - or - depfind(other_id, writer_id) - ) + # }}} - if has_dependency_relationship: - continue + # depends_on: mapping from insn_ids to their dependencies + depends_on = {insn.id: set() for insn in + kernel.instructions} + # rev_depends: mapping from insn_ids to their reverse deps. 
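+    # (i.e. "b" is in rev_depends["a"] iff instruction "b" directly
+    # depends on instruction "a")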
+    rev_depends = {insn.id: set() for insn in
+            kernel.instructions}

-            is_relationship_by_aliasing = not (
-                    writer_id in unaliased_writers
-                    and (other_id in unaliased_writers
-                        or other_id in unaliased_readers))
+    # {{{ populate rev_depends, depends_on

-            # Do not enforce ordering for disjoint access ranges
-            if (not is_relationship_by_aliasing and not
-                    overlap_checker.do_access_ranges_overlap_conservative(
-                        writer_id, "w", other_id, "any", name)):
-                continue
+    for insn in kernel.instructions:
+        depends_on[insn.id].update(insn.depends_on)
+        for dep in insn.depends_on:
+            rev_depends[dep].add(insn.id)
+
+    # }}}
+
+    # {{{ remove pairs from dep_reqs_to_vars for which dependencies exist
+
+    topological_order = _get_topological_order(kernel)
+
+    def discard_dep_reqs_in_order(dep_reqs_to_vars, edges, order):
+        """
+        Remove the dependency requirements of each insn_id on all of its
+        direct/indirect predecessors in the directed graph with insn_ids as
+        nodes and *edges* as the connectivity.
+
+        :arg order: An instance of :class:`list` of instruction ids in which
+            the *edges* graph is to be traversed.
+        """
+        # predecessors: mapping from insn_id to its direct/indirect
+        # predecessors
+        predecessors = {}
+
+        for insn_id in order:
+            # insn_predecessors: insn_id's direct+indirect predecessors
+
+            # This set of predecessors is complete because we're
+            # traversing in topological order: No predecessor
+            # can occur after the instruction itself.
+            insn_predecessors = predecessors.pop(insn_id, set())
+
+            for pred in insn_predecessors:
+                dep_reqs_to_vars.pop(
+                        (insn_id, pred),
+                        # don't fail if pair doesn't exist
+                        None)
+
+            for successor in edges[insn_id]:
+                predecessors.setdefault(successor, set()).update(
+                        insn_predecessors | {insn_id})
+
+    # forward dep. graph traversal in reverse topological sort order
+    # (proceeds "end of program" -> "beginning of program")
+    discard_dep_reqs_in_order(dep_reqs_to_vars, depends_on,
+            topological_order[::-1])
+
+    # reverse dep. graph traversal in topological sort order
+    # (proceeds "beginning of program" -> "end of program")
+    discard_dep_reqs_in_order(dep_reqs_to_vars, rev_depends, topological_order)
+
+    # }}}
+
+    # {{{ handle dependency requirements that weren't satisfied
+
+    for (writer_id, other_id), variables in dep_reqs_to_vars.items():
+        writer = kernel.id_to_insn[writer_id]
+        other = kernel.id_to_insn[other_id]
+
+        for var in variables:
+            eq_class = aliasing_equiv_classes[var]
+            unaliased_readers = rmap.get(var, set())
+            unaliased_writers = wmap.get(var, set())
+
+            is_relationship_by_aliasing = not (
+                    writer_id in unaliased_writers
+                    and (other_id in unaliased_writers
+                        or other_id in unaliased_readers))
+
+            # Do not enforce ordering for disjoint access ranges
+            if (not is_relationship_by_aliasing and not
+                    overlap_checker.do_access_ranges_overlap_conservative(
+                        writer_id, "w", other_id, "any", var)):
+                continue

-            # Do not enforce ordering for aliasing-based relationships
-            # in different groups.
-            if (is_relationship_by_aliasing and (
-                    bool(writer.groups & other.conflicts_with_groups)
-                    or
-                    bool(other.groups & writer.conflicts_with_groups))):
-                continue
+            # Do not enforce ordering for aliasing-based relationships
+            # in different groups.
+            if (is_relationship_by_aliasing and (
+                    bool(writer.groups & other.conflicts_with_groups)
+                    or
+                    bool(other.groups & writer.conflicts_with_groups))):
+                continue

-            msg = ("No dependency relationship found between "
-                    "'{writer_id}' which writes {var} and "
-                    "'{other_id}' which also accesses {var}. 
" - "Either add a (possibly indirect) dependency " - "between the two, or add them to each others' nosync " - "set to indicate that no ordering is intended, or " - "turn off this check by setting the " - "'enforce_variable_access_ordered' option " - "(more issues of this type may exist--only reporting " - "the first one)" - .format( - writer_id=writer_id, - other_id=other_id, - var=( - "the variable '%s'" % name - if len(eq_class) == 1 - else ( - "the aliasing equivalence class '%s'" - % ", ".join(eq_class)) - ))) - - from loopy.diagnostic import VariableAccessNotOrdered - raise VariableAccessNotOrdered(msg) - - logger.debug("%s: check_variable_access_ordered: done" % kernel.name) + msg = ("No dependency relationship found between " + "'{writer_id}' which writes {var} and " + "'{other_id}' which also accesses {var}. " + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option " + "(more issues of this type may exist--only reporting " + "the first one)" + .format( + writer_id=writer_id, + other_id=other_id, + var=( + "the variable '%s'" % var + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) + + # }}} def check_variable_access_ordered(kernel): """Checks that between each write to a variable and all other accesses to the variable there is either: - * an (at least indirect) depdendency edge, or + * a direct/indirect depdendency edge, or * an explicit statement that no ordering is necessary (expressed - through a bi-directional :attr:`loopy.Instruction.no_sync_with`) + through a bi-directional :attr:`loopy.InstructionBase.no_sync_with`) """ if kernel.options.enforce_variable_access_ordered not in [ @@ -728,30 +881,17 @@ def check_variable_access_ordered(kernel): if kernel.options.enforce_variable_access_ordered == "no_check": return - if kernel.options.enforce_variable_access_ordered: - try: - _check_variable_access_ordered_inner(kernel) - except RuntimeError as e: - if isinstance(e.args[0], str) and ( - e.args[0].startswith('maximum recursion depth exceeded')): - from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) - else: - raise e - else: - from loopy.diagnostic import VariableAccessNotOrdered - try: + from pytools import ProcessLogger + with ProcessLogger(logger, "%s: check variable access ordered" % kernel.name): + if kernel.options.enforce_variable_access_ordered: _check_variable_access_ordered_inner(kernel) - except VariableAccessNotOrdered as e: - from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "variable_access_ordered", str(e)) - except RuntimeError as e: - if isinstance(e.args[0], str) and ( - e.args[0].startswith('maximum recursion depth exceeded')): + else: + from loopy.diagnostic import VariableAccessNotOrdered + try: + _check_variable_access_ordered_inner(kernel) + except VariableAccessNotOrdered as e: from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) - else: - raise e + warn_with_kernel(kernel, "variable_access_ordered", str(e)) # }}} @@ -770,6 +910,7 @@ def pre_schedule_checks(kernel, callables_table): check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) 
check_for_inactive_iname_access(kernel)
+        check_for_unused_inames(kernel)
         check_for_write_races(kernel)
         check_for_data_dependent_parallel_bounds(kernel)
         check_bounds(kernel)
@@ -793,12 +934,75 @@

 # {{{ check for unused hw axes

+# {{{ find boostable insn ids
+
+def _find_boostable_insn_ids(kernel):
+    """There used to exist a broken heuristic called "boostability" that allowed
+    instructions to be pushed into hardware-parallel loops. This function
+    survives from that era, for now, to provide a thin veneer of compatibility.
+    """
+    logger.debug("%s: idempotence" % kernel.name)
+
+    writer_map = kernel.writer_map()
+
+    arg_names = {arg.name for arg in kernel.args}
+
+    var_names = arg_names | set(kernel.temporary_variables.keys())
+
+    reads_map = {
+            insn.id: insn.read_dependency_names() & var_names
+            for insn in kernel.instructions}
+
+    from collections import defaultdict
+    dep_graph = defaultdict(set)
+
+    for insn in kernel.instructions:
+        dep_graph[insn.id] = {writer_id
+                for var in reads_map[insn.id]
+                for writer_id in writer_map.get(var, set())}
+
+    # Find SCCs of dep_graph. These are used for checking if the instruction is
+    # in a dependency cycle.
+    from pytools.graph import compute_sccs
+
+    sccs = {item: scc
+            for scc in compute_sccs(dep_graph)
+            for item in scc}
+
+    non_idempotently_updated_vars = set()
+    boostable_insn_ids = set()
+
+    for insn in kernel.instructions:
+        boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id]
+
+        if boostable:
+            boostable_insn_ids.add(insn.id)
+        else:
+            non_idempotently_updated_vars.update(
+                    insn.assignee_var_names())
+
+    # {{{ remove boostability from insns that access non-idempotently updated vars
+
+    for insn_id in boostable_insn_ids.copy():
+        insn = kernel.id_to_insn[insn_id]
+        if bool(non_idempotently_updated_vars & insn.dependency_names()):
+            boostable_insn_ids.remove(insn_id)
+
+    # }}}
+
+    return boostable_insn_ids
+
+# }}}
+
+
 def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table,
         sched_index=None):
     from loopy.schedule import (CallKernel, RunInstruction,
             Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
             get_insn_ids_for_block_at, gather_schedule_block)

+    boostable_insn_ids = _find_boostable_insn_ids(kernel)
+
     if sched_index is None:
         group_axes = set()
         local_axes = set()
@@ -812,8 +1016,8 @@
                 get_insn_ids_for_block_at(kernel.schedule, sched_index),
                 callables_table)

-        group_axes = set(ax for ax, length in enumerate(group_size))
-        local_axes = set(ax for ax, length in enumerate(local_size))
+        group_axes = {ax for ax, length in enumerate(group_size)}
+        local_axes = {ax for ax, length in enumerate(local_size)}

         i = sched_index + 1
         assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
@@ -834,9 +1038,6 @@
             insn = kernel.id_to_insn[sched_item.insn_id]
             i += 1

-            if insn.boostable:
-                continue
-
             group_axes_used = set()
             local_axes_used = set()

@@ -856,17 +1057,44 @@
                     raise LoopyError("auto local tag encountered")

             if group_axes != group_axes_used:
-                raise LoopyError("instruction '%s' does not use all group hw axes "
-                        "(available: %s used:%s)"
-                        % (insn.id,
-                            ",".join(str(i) for i in group_axes),
-                            ",".join(str(i) for i in group_axes_used)))
+                if insn.id in boostable_insn_ids:
+                    warn("instruction '%s' does not use all group hw axes"
+                            " (available: %s used:%s). 
Loopy will generate code" + " with the instruction executed along all the" + " missing hw axes. This will result in an" + " error from 2021.x onwards, calling" + " loopy.add_inames_for_unused_hw_axes(...)" + " might help in the transition." + % (insn.id, + ",".join(str(i) for i in group_axes), + ",".join(str(i) for i in group_axes_used)), + DeprecationWarning, stacklevel=2) + else: + raise LoopyError("instruction '%s' does not use all group" + " hw axes (available: %s used:%s)" + % (insn.id, + ",".join(str(i) for i in group_axes), + ",".join(str(i) for i in group_axes_used))) + if local_axes != local_axes_used: - raise LoopyError("instruction '%s' does not use all local hw axes " - "(available: %s used:%s)" - % (insn.id, - ",".join(str(i) for i in local_axes), - ",".join(str(i) for i in local_axes_used))) + if insn.id in boostable_insn_ids: + warn("instruction '%s' does not use all local hw axes" + " (available: %s used:%s). Loopy will generate code" + " with the instruction executed along all the" + " missing hw axes. This will result in an" + " error from 2021.x onwards, calling" + " loopy.add_inames_for_unused_hw_axes(...)" + " might help in the transition." + % (insn.id, + ",".join(str(i) for i in local_axes), + ",".join(str(i) for i in local_axes_used)), + DeprecationWarning, stacklevel=2) + else: + raise LoopyError("instruction '%s' does not use all local" + " hw axes (available: %s used:%s)" + % (insn.id, + ",".join(str(i) for i in local_axes), + ",".join(str(i) for i in local_axes_used))) elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)): i += 1 @@ -893,18 +1121,18 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): from loopy.kernel.data import ArrayBase, Assignment from loopy.types import AtomicType atomicity_candidates = ( - set(v.name for v in six.itervalues(kernel.temporary_variables) - if isinstance(v.dtype, AtomicType)) + {v.name for v in kernel.temporary_variables.values() + if isinstance(v.dtype, AtomicType)} | - set(v.name for v in kernel.args + {v.name for v in kernel.args if isinstance(v, ArrayBase) - and isinstance(v.dtype, AtomicType))) + and isinstance(v.dtype, AtomicType)}) for insn in kernel.instructions: if not isinstance(insn, Assignment): continue - atomic_accesses = set(a.var_name for a in insn.atomicity) + atomic_accesses = {a.var_name for a in insn.atomicity} if not atomic_accesses <= atomicity_candidates: raise LoopyError("atomic access in instruction '%s' to " "non-atomic variable(s) '%s'" @@ -970,12 +1198,12 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): def check_that_all_insns_are_scheduled(kernel): - all_schedulable_insns = set(insn.id for insn in kernel.instructions) + all_schedulable_insns = {insn.id for insn in kernel.instructions} from loopy.schedule import sched_item_to_insn_id - scheduled_insns = set( + scheduled_insns = { insn_id for sched_item in kernel.schedule - for insn_id in sched_item_to_insn_id(sched_item)) + for insn_id in sched_item_to_insn_id(sched_item)} assert scheduled_insns <= all_schedulable_insns @@ -983,7 +1211,7 @@ def check_that_all_insns_are_scheduled(kernel): from loopy.diagnostic import UnscheduledInstructionError raise UnscheduledInstructionError( "unscheduled instructions: '%s'" - % ', '.join(all_schedulable_insns - scheduled_insns)) + % ", ".join(all_schedulable_insns - scheduled_insns)) # }}} @@ -996,11 +1224,11 @@ def check_that_shapes_and_strides_are_arguments(kernel): from loopy.symbolic import get_dependencies import loopy as lp - integer_arg_names = set( + 
integer_arg_names = { arg.name for arg in kernel.args if isinstance(arg, ValueArg) - and arg.dtype.is_integral()) + and arg.dtype.is_integral()} for arg in kernel.args: if isinstance(arg, ArrayBase): @@ -1069,7 +1297,7 @@ def check_implemented_domains(kernel, implemented_domains, code=None): last_idomains = None last_insn_inames = None - for insn_id, idomains in six.iteritems(implemented_domains): + for insn_id, idomains in implemented_domains.items(): insn = kernel.id_to_insn[insn_id] assert idomains @@ -1127,9 +1355,9 @@ def check_implemented_domains(kernel, implemented_domains, code=None): i_minus_d = insn_impl_domain - desired_domain d_minus_i = desired_domain - insn_impl_domain - parameter_inames = set( + parameter_inames = { insn_domain.get_dim_name(dim_type.param, i) - for i in range(insn_impl_domain.dim(dim_type.param))) + for i in range(insn_impl_domain.dim(dim_type.param))} lines = [] for bigger, smaller, diff_set, gist_domain in [ @@ -1157,10 +1385,10 @@ def check_implemented_domains(kernel, implemented_domains, code=None): iname, pt.get_coordinate_val(tp, dim).to_python())) lines.append( - "sample point in %s but not %s: %s" % ( + "sample point in {} but not {}: {}".format( bigger, smaller, ", ".join(point_axes))) lines.append( - "gist of constraints in %s but not %s: %s" % ( + "gist of constraints in {} but not {}: {}".format( smaller, bigger, gist_domain)) if code is not None: diff --git a/loopy/cli.py b/loopy/cli.py index 3dbdeb41e37aebc0e3c2b0b8b3fc68866dfec080..a7d209ae87b2120f90a8d360c3ff9eb13bc925f5 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import sys import loopy as lp @@ -39,16 +37,16 @@ def defines_to_python_code(defines_str): import re define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$") result = [] - for l in defines_str.split("\n"): - if not l.strip(): + for line in defines_str.split("\n"): + if not line.strip(): continue - match = define_re.match(l) + match = define_re.match(line) if match is None: - raise RuntimeError("#define not understood: '%s'" % l) + raise RuntimeError("#define not understood: '%s'" % line) result.append( - "%s = %s" % (match.group(1), to_python_literal(match.group(2)))) + "{} = {}".format(match.group(1), to_python_literal(match.group(2)))) return "\n".join(result) @@ -60,7 +58,7 @@ def main(): parser.add_argument("infile", metavar="INPUT_FILE") parser.add_argument("outfile", default="-", metavar="OUTPUT_FILE", - help="Defaults to stdout ('-').", nargs='?') + help="Defaults to stdout ('-').", nargs="?") parser.add_argument("--lang", metavar="LANGUAGE", help="loopy|fortran") parser.add_argument("--target", choices=( "opencl", "ispc", "ispc-occa", "c", "c-fortran", "cuda"), @@ -112,7 +110,7 @@ def main(): ".f77": "fortran", ".F77": "fortran", }.get(ext) - with open(args.infile, "r") as infile_fd: + with open(args.infile) as infile_fd: infile_content = infile_fd.read() if args.lang is not None: @@ -143,15 +141,15 @@ def main(): data_dic["np"] = np if args.occa_defines: - with open(args.occa_defines, "r") as defines_fd: + with open(args.occa_defines) as defines_fd: occa_define_code = defines_to_python_code(defines_fd.read()) exec(compile(occa_define_code, args.occa_defines, "exec"), data_dic) - with open(args.infile, "r") as infile_fd: + with open(args.infile) as infile_fd: exec(compile(infile_content, args.infile, "exec"), data_dic) if args.transform: - with open(args.transform, "r") as xform_fd: + with open(args.transform) as xform_fd: exec(compile(xform_fd.read(), 
args.transform, "exec"), data_dic) @@ -166,14 +164,14 @@ def main(): elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None if args.transform: - with open(args.transform, "r") as xform_fd: + with open(args.transform) as xform_fd: pre_transform_code = xform_fd.read() if args.occa_defines: if pre_transform_code is None: pre_transform_code = "" - with open(args.occa_defines, "r") as defines_fd: + with open(args.occa_defines) as defines_fd: pre_transform_code = ( defines_to_python_code(defines_fd.read()) + pre_transform_code) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 70cd7cc956acdfdc59402851b081602ca78ce187..e324c6d77248711a18b7d1ca29702791d9688e9e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -25,8 +23,6 @@ THE SOFTWARE. import logging logger = logging.getLogger(__name__) -import six - from loopy.diagnostic import LoopyError, warn from pytools import ImmutableRecord import islpy as isl @@ -44,6 +40,22 @@ from cgen import Collection from pytools import ProcessLogger +__doc__ = """ +.. currentmodule:: loopy.codegen + +.. autoclass:: ImplementedDataInfo + +.. autoclass:: PreambleInfo + +.. autoclass:: VectorizationInfo + +.. autoclass:: SeenFunction + +.. autoclass:: CodeGenerationState + +.. automodule:: loopy.codegen.result +""" + # {{{ implemented data info @@ -123,7 +135,7 @@ class Unvectorizable(Exception): pass -class VectorizationInfo(object): +class VectorizationInfo: """ .. attribute:: iname .. attribute:: length @@ -152,7 +164,7 @@ class SeenFunction(ImmutableRecord): arg_dtypes=arg_dtypes) -class CodeGenerationState(object): +class CodeGenerationState: """ .. attribute:: kernel .. 
attribute:: target @@ -436,7 +448,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel, callables_table) - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") @@ -488,9 +500,8 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): raise ValueError("argument type not understood: '%s'" % type(arg)) allow_complex = False - for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): - dtype = var.dtype - if dtype.involves_complex(): + for var in kernel.args + list(kernel.temporary_variables.values()): + if var.dtype.involves_complex(): allow_complex = True # }}} @@ -534,10 +545,12 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): # {{{ handle preambles - for arg in kernel.args: - seen_dtypes.add(arg.dtype) - for tv in six.itervalues(kernel.temporary_variables): - seen_dtypes.add(tv.dtype) + for idi in codegen_state.implemented_data_info: + seen_dtypes.add(idi.dtype) + + for tv in kernel.temporary_variables.values(): + for idi in tv.decl_info(kernel.target, index_dtype=kernel.index_dtype): + seen_dtypes.add(idi.dtype) preambles = kernel.preambles[:] diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index b736191ec1dadb842e12453fbec3b68e831338f6..b02c13b389266379a03c41b6f60c2163b16b2986 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -29,11 +27,13 @@ from islpy import dim_type # {{{ approximate, convex bounds check generator -def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domain): +def get_approximate_convex_bounds_checks(domain, check_inames, + implemented_domain, op_cache_manager): if isinstance(domain, isl.BasicSet): domain = isl.Set.from_basic_set(domain) domain = domain.remove_redundancies() - result = domain.eliminate_except(check_inames, [dim_type.set]) + result = op_cache_manager.eliminate_except(domain, check_inames, + (dim_type.set,)) # This is ok, because we're really looking for the # projection, with no remaining constraints from diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 3bad73462598c61895f2d274c13941e433986cb4..c2006df518f19d19e241ce8c699243314076b1ce 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -1,6 +1,5 @@ """Loop nest build top-level control/hoisting.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -201,14 +200,14 @@ def get_required_predicates(kernel, sched_index): return result -def group_by(l, key, merge): - if not l: - return l +def group_by(entry, key, merge): + if not entry: + return entry result = [] - previous = l[0] + previous = entry[0] - for item in l[1:]: + for item in entry[1:]: if key(previous) == key(item): previous = merge(previous, item) @@ -329,7 +328,7 @@ def build_loop_nest(codegen_state, schedule_index): # Each instruction individually gets its bounds checks, # so we can safely overapproximate here. 
return get_approximate_convex_bounds_checks(domain, - check_inames, self.impl_domain) + check_inames, self.impl_domain, self.kernel.cache_manager) def build_insn_group(sched_index_info_entries, codegen_state, done_group_lengths=set()): @@ -475,7 +474,7 @@ def build_loop_nest(codegen_state, schedule_index): sched_index_info_entries[0:group_length], inner_codegen_state, done_group_lengths=( - done_group_lengths | set([group_length]))) + done_group_lengths | {group_length})) # gen_code returns a list diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 5e0747246160ddc2934c3d545c03a2a9b4090d5d..71133ef7cf2a29be1a8673e99a81f21544f5404a 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -1,6 +1,5 @@ """Code generation for Instruction objects.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,7 +24,6 @@ THE SOFTWARE. """ -from six.moves import range import islpy as isl dim_type = isl.dim_type from loopy.codegen import Unvectorizable @@ -39,7 +37,8 @@ def to_codegen_result( chk_domain = isl.Set.from_basic_set(domain) chk_domain = chk_domain.remove_redundancies() - chk_domain = chk_domain.eliminate_except(check_inames, [dim_type.set]) + chk_domain = codegen_state.kernel.cache_manager.eliminate_except(chk_domain, + check_inames, (dim_type.set,)) chk_domain, implemented_domain = isl.align_two( chk_domain, codegen_state.implemented_domain) @@ -171,7 +170,7 @@ def generate_assignment_instruction_code(codegen_state, insn): gs, ls = kernel.get_grid_size_upper_bounds() - printf_format = "%s.%s[%s][%s]: %s" % ( + printf_format = "{}.{}[{}][{}]: {}".format( kernel.name, insn.id, ", ".join("gid%d=%%d" % i for i in range(len(gs))), @@ -208,7 +207,7 @@ def generate_assignment_instruction_code(codegen_state, insn): else: printf_args_str = "" - printf_insn = S("printf(\"%s\\n\"%s)" % ( + printf_insn = S('printf("{}\\n"{})'.format( printf_format, printf_args_str)) from cgen import Block @@ -274,7 +273,7 @@ def generate_c_instruction_code(codegen_state, insn): if body: body.append(Line()) - body.extend(Line(l) for l in insn.code.split("\n")) + body.extend(Line(line) for line in insn.code.split("\n")) return Block(body) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 5796f5133a1d82890c55accf28072dd5db582ee4..59dd33c95507e0e9b790ec5740f2256279393e67 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from six.moves import range from loopy.diagnostic import warn, LoopyError from loopy.codegen.result import merge_codegen_results @@ -80,11 +77,16 @@ def get_slab_decomposition(kernel, iname): if upper_incr: assert upper_incr > 0 - upper_slab = ("final", isl.BasicSet.universe(space) - .add_constraint( - isl.Constraint.inequality_from_aff( - iname_rel_aff(space, - iname, ">", upper_bound_aff-upper_incr)))) + upper_bset = isl.BasicSet.universe(space).add_constraint( + isl.Constraint.inequality_from_aff( + iname_rel_aff(space, + iname, ">", upper_bound_aff-upper_incr))) + if lower_incr: + # Ensure that this slab is actually distinct from the + # lower one, if it exists. 
+ _, lower_bset = lower_slab + upper_bset, = upper_bset.subtract(lower_bset).get_basic_sets() + upper_slab = ("final", upper_bset) upper_bulk_bound = ( isl.Constraint.inequality_from_aff( iname_rel_aff(space, @@ -320,7 +322,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, if len(slabs) > 1: result.append( codegen_state.ast_builder.emit_comment( - "%s slab for '%s'" % (slab_name, iname))) + f"{slab_name} slab for '{iname}'")) # Have the conditional infrastructure generate the # slabbing conditionals. @@ -359,7 +361,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): result = [] for slab_name, slab in slabs: - cmt = "%s slab for '%s'" % (slab_name, loop_iname) + cmt = f"{slab_name} slab for '{loop_iname}'" if len(slabs) == 1: cmt = None diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7950c56b3b62693f974cbcc5ab8686f30fa42cbe..d7314fb9750d63dd2f42282be6e1340e2ce073de 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six from pytools import ImmutableRecord @@ -43,6 +40,19 @@ def process_preambles(preambles): for lines in dedup_preambles] +__doc__ = """ +.. currentmodule:: loopy.codegen.result + +.. autoclass:: GeneratedProgram + +.. autoclass:: CodeGenerationResult + +.. autofunction:: merge_codegen_results + +.. autofunction:: generate_host_or_device_program +""" + + # {{{ code generation result class GeneratedProgram(ImmutableRecord): @@ -218,7 +228,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): el.current_program(codegen_state).name == codegen_result.current_program(codegen_state).name) - for insn_id, idoms in six.iteritems(el.implemented_domains): + for insn_id, idoms in el.implemented_domains.items(): implemented_domains.setdefault(insn_id, []).extend(idoms) if not codegen_state.is_generating_device_code: diff --git a/loopy/compiled.py b/loopy/compiled.py index 613bca56fc1de23a66d45d8f990f91f9d3f9b949..f9313c6c95612ddba6566d7c8175d998e8312147 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" __license__ = """ @@ -30,11 +28,14 @@ from loopy.target.pyopencl_execution import ( # noqa # {{{ compatibility class CompiledKernel(PyOpenCLKernelExecutor): + """ + .. automethod:: __call__ + """ def __init__(self, context, kernel): from warnings import warn warn("CompiledKernel is deprecated. 
Use LoopKernel.__call__ directly.", DeprecationWarning, stacklevel=2) - super(CompiledKernel, self).__init__(context, kernel) + super().__init__(context, kernel) # }}} diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 561bbc7cc56a8338593a80b7d5890553af89c79b..0ae2e530ad5e3c0de73b3d0d064f7dd85e055894 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -60,7 +58,7 @@ def warn_with_kernel(kernel, id, text, type=LoopyWarning): % id) from warnings import warn - warn("in kernel %s: %s" % (kernel.name, text), type, stacklevel=2) + warn(f"in kernel {kernel.name}: {text}", type, stacklevel=2) warn = MovedFunctionDeprecationWrapper(warn_with_kernel) diff --git a/loopy/expression.py b/loopy/expression.py index 8414efaa5dd614d39e93f55aea3836141e5a6d6e..10e19301470eadecc6f3d206373fb7c5df1c5ae8 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012-15 Andreas Kloeckner" __license__ = """ @@ -32,20 +30,20 @@ from loopy.diagnostic import LoopyError # type_context may be: -# - 'i' for integer - -# - 'f' for single-precision floating point -# - 'd' for double-precision floating point +# - "i" for integer - +# - "f" for single-precision floating point +# - "d" for double-precision floating point # or None for 'no known context'. def dtype_to_type_context(target, dtype): from loopy.types import NumpyType if dtype.is_integral(): - return 'i' + return "i" if isinstance(dtype, NumpyType) and dtype.dtype in [np.float64, np.complex128]: - return 'd' + return "d" if isinstance(dtype, NumpyType) and dtype.dtype in [np.float32, np.complex64]: - return 'f' + return "f" if target.is_vector_dtype(dtype): return dtype_to_type_context( target, NumpyType(dtype.numpy_dtype.fields["x"][0])) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 9b63c10f8422d0a17c295e1ef9a4609f5db90e2b..c8fda36d070c3aab49fec4f9d828d9130ad8358c 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -90,17 +88,17 @@ def _extract_loopy_lines(source): loopy_lines = [] in_loopy_code = False - for l in lines: - comment_match = comment_re.match(l) + for line in lines: + comment_match = comment_re.match(line) if comment_match is None: if in_loopy_code: raise LoopyError("non-comment source line in loopy block") - remaining_lines.append(l) + remaining_lines.append(line) # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) continue cmt = comment_match.group(1) @@ -112,7 +110,7 @@ def _extract_loopy_lines(source): in_loopy_code = True # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) elif cmt_stripped == "$loopy end": if not in_loopy_code: @@ -120,16 +118,16 @@ def _extract_loopy_lines(source): in_loopy_code = False # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# "+line) elif in_loopy_code: loopy_lines.append(cmt) else: - remaining_lines.append(l) + remaining_lines.append(line) # Preserves line numbers in loopy code, for debuggability - loopy_lines.append("# "+l) + loopy_lines.append("# 
"+line) return "\n".join(remaining_lines), "\n".join(loopy_lines) @@ -322,9 +320,9 @@ def parse_fortran(source, filename="", free_form=None, strict=None, import logging console = logging.StreamHandler() console.setLevel(logging.INFO) - formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') + formatter = logging.Formatter("%(name)-12s: %(levelname)-8s %(message)s") console.setFormatter(formatter) - logging.getLogger('fparser').addHandler(console) + logging.getLogger("fparser").addHandler(console) from fparser import api tree = api.parse(source, isfree=free_form, isstrict=strict, diff --git a/loopy/frontend/fortran/diagnostic.py b/loopy/frontend/fortran/diagnostic.py index 7cb3c79cc646f0959f69614e5141441e8fc3261b..b2ea02c05b53e132dddaa5d8102620e4941f35cd 100644 --- a/loopy/frontend/fortran/diagnostic.py +++ b/loopy/frontend/fortran/diagnostic.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ diff --git a/loopy/frontend/fortran/expression.py b/loopy/frontend/fortran/expression.py index 1400fb3b71416355229f11a1e6bbd74e62b4897f..cc93e914d0470c423812b69913a7185dca9c7b67 100644 --- a/loopy/frontend/fortran/expression.py +++ b/loopy/frontend/fortran/expression.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -25,7 +23,7 @@ THE SOFTWARE. from pymbolic.parser import Parser as ExpressionParserBase from loopy.frontend.fortran.diagnostic import TranslationError -from six.moves import intern +from sys import intern import numpy as np import pytools.lex diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 39c2c62d97b23cd44f64ab59920e4336991a47b5..8e3ef5728fa9e0b5ebfc4348f6cc0daf03733ddd 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -24,8 +22,7 @@ THE SOFTWARE. 
import re -import six -from six.moves import intern +from sys import intern import loopy as lp import numpy as np @@ -125,7 +122,7 @@ class SubscriptIndexAdjuster(IdentityMapper): # {{{ scope -class Scope(object): +class Scope: def __init__(self, subprogram_name, arg_names=set()): self.subprogram_name = subprogram_name @@ -163,8 +160,8 @@ class Scope(object): def known_names(self): return (self.used_names - | set(six.iterkeys(self.dim_map)) - | set(six.iterkeys(self.type_map))) + | set(self.dim_map.keys()) + | set(self.type_map.keys())) def is_known(self, name): return (name in self.used_names diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index a124757f4729d270b0ab47c7e07cf1c436733045..f4eea255b9b89dba0300f1e81194b0ff64d7007d 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -27,7 +25,7 @@ import re from loopy.diagnostic import LoopyError -class FTreeWalkerBase(object): +class FTreeWalkerBase: def __init__(self, filename): from loopy.frontend.fortran.expression import FortranExpressionParser self.expr_parser = FortranExpressionParser(self) diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index ec1b10f1f512e18079f44b94b298e876776cae35..7f9177e0ef8430cc450cb462641b12ed1a9f9b28 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -1,5 +1,3 @@ -from __future__ import division - from IPython.core.magic import (magics_class, Magics, cell_magic) import loopy as lp diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 4d57de26b6cfa3d8932ba4f85ed02b97ddcda975..59748e01baa7d387514a7a0619f8482d58c363e7 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -1,6 +1,5 @@ """isl helpers""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,13 +24,6 @@ THE SOFTWARE. 
""" -import six -import numpy as np -from six.moves import range, zip - -from pymbolic.mapper.evaluator import \ - EvaluationMapper as EvaluationMapperBase - from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl @@ -499,130 +491,6 @@ def obj_involves_variable(obj, var_name): return False -# {{{ performance tweak for dim_{min,max}: project first - -def _runs_in_integer_set(s, max_int=None): - if not s: - return - - if max_int is None: - max_int = max(s) - - i = 0 - while i < max_int: - if i in s: - start = i - - i += 1 - while i < max_int and i in s: - i += 1 - - end = i - - yield (start, end-start) - - else: - i += 1 - - -class TooManyInteractingDims(Exception): - pass - - -def _find_aff_dims(aff, dim_types_and_gen_dim_types): - result = [] - - for dt, gen_dt in dim_types_and_gen_dim_types: - for i in range(aff.dim(dt)): - if not aff.get_coefficient_val(dt, i).is_zero(): - result.append((gen_dt, i)) - - result = set(result) - - for i in range(aff.dim(dim_type.div)): - if not aff.get_coefficient_val(dim_type.div, i).is_zero(): - result.update(_find_aff_dims( - aff.get_div(i), - dim_types_and_gen_dim_types)) - - return result - - -def _transitive_closure(graph_dict): - pass - - -def _find_noninteracting_dims(obj, dt, idx, other_dt, stop_at=6): - if isinstance(obj, isl.BasicSet): - basics = [obj] - elif isinstance(obj, isl.Set): - basics = obj.get_basic_sets() - else: - raise TypeError("unsupported arg type '%s'" % type(obj)) - - connections = [] - for bs in basics: - for c in bs.get_constraints(): - conn = _find_aff_dims( - c.get_aff(), - [(dim_type.param, dim_type.param), (dim_type.in_, dim_type.set)]) - if len(conn) > 1: - connections.append(conn) - - interacting = set([(dt, idx)]) - - while True: - changed_something = False - - # Compute the connected component near (dt, idx) by fixed point iteration - - for conn in connections: - prev_len = len(interacting) - - overlap = interacting & conn - if overlap: - interacting.update(conn) - - if len(interacting) != prev_len: - changed_something = True - - if len(interacting) >= stop_at: - raise TooManyInteractingDims() - - if not changed_something: - break - - return set(range(obj.dim(other_dt))) - set( - idx for dt, idx in interacting - if dt == other_dt) - - -def _eliminate_noninteracting(obj, dt, idx, other_dt): - obj = obj.compute_divs() - try: - nonint = _find_noninteracting_dims(obj, dt, idx, other_dt) - - except TooManyInteractingDims: - return obj - - for first, n in _runs_in_integer_set(nonint): - obj = obj.eliminate(other_dt, first, n) - - return obj - - -def dim_min_with_elimination(obj, idx): - obj_elim = _eliminate_noninteracting(obj, dim_type.out, idx, dim_type.param) - return obj_elim.dim_min(idx) - - -def dim_max_with_elimination(obj, idx): - obj_elim = _eliminate_noninteracting(obj, dim_type.out, idx, dim_type.param) - return obj_elim.dim_max(idx) - -# }}} - - # {{{ get_simple_strides def get_simple_strides(bset, key_by="name"): @@ -718,7 +586,7 @@ def get_simple_strides(bset, key_by="name"): # }}} -# {{{{ find_max_of_pwaff_with_params +# {{{ find_max_of_pwaff_with_params def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): if n_allowed_params is None: @@ -743,30 +611,6 @@ def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): # {{{ subst_into_pwqpolynomial -class QPolynomialEvaluationMapper(EvaluationMapperBase): - def __init__(self, space): - self.zero = isl.QPolynomial.zero_on_domain(space) - - context = {} - for name, (dt, pos) in six.iteritems(space.get_var_dict()): - if dt == 
dim_type.set:
-                dt = dim_type.in_
-
-            context[name] = isl.QPolynomial.var_on_domain(space, dt, pos)
-
-        super(QPolynomialEvaluationMapper, self).__init__(context)
-
-    def map_constant(self, expr):
-        if isinstance(expr, np.integer):
-            expr = int(expr)
-
-        return self.zero + expr
-
-    def map_quotient(self, expr):
-        raise TypeError("true division in '%s' not supported "
-                "for as-pwaff evaluation" % expr)
-
-
 def get_param_subst_domain(new_space, base_obj, subst_dict):
     """Modify the :mod:`islpy` object *base_obj* to incorporate parameters for
     the keys of *subst_dict*, and rename existing parameters to include a
@@ -828,8 +672,18 @@ def get_param_subst_domain(new_space, base_obj, subst_dict):


 def subst_into_pwqpolynomial(new_space, poly, subst_dict):
+    """
+    Returns an instance of :class:`islpy.PwQPolynomial` with substitutions from
+    *subst_dict* substituted into *poly*.
+
+    :arg poly: an instance of :class:`islpy.PwQPolynomial`
+    :arg subst_dict: a mapping from parameters of *poly* to
+        :class:`pymbolic.primitives.Expression` made up of terms comprising the
+        parameters of *new_space*. The expression must be affine in the param
+        dims of *new_space*.
+    """
     if not poly.get_pieces():
-        assert new_space.is_params()
+        # pw poly is universally zero
         result = isl.PwQPolynomial.zero(new_space.insert_dims(dim_type.out, 0, 1))
         assert result.dim(dim_type.out) == 1
         return result
@@ -839,7 +693,7 @@
     poly, subst_domain, subst_dict = get_param_subst_domain(
             new_space, poly, subst_dict)

-    from loopy.symbolic import qpolynomial_to_expr
+    from loopy.symbolic import qpolynomial_to_expr, qpolynomial_from_expr
     new_pieces = []
     for valid_set, qpoly in poly.get_pieces():
         valid_set = valid_set & subst_domain
@@ -851,7 +705,7 @@
                 SubstitutionMapper, make_subst_func)
         sub_mapper = SubstitutionMapper(make_subst_func(subst_dict))
         expr = sub_mapper(qpolynomial_to_expr(qpoly))
-        qpoly = QPolynomialEvaluationMapper(valid_set.space)(expr)
+        qpoly = qpolynomial_from_expr(valid_set.space, expr)

         new_pieces.append((valid_set, qpoly))

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 0ce06a126ef435f99b32c89bcd576beba648a3bb..1eac93e415663bbca818c591c050d2543a469683 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1,7 +1,5 @@
 """Kernel object."""

-from __future__ import division, absolute_import
-
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

 __license__ = """
@@ -24,8 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
""" -import six -from six.moves import range, zip, intern +from sys import intern from collections import defaultdict @@ -49,7 +46,7 @@ from warnings import warn class _UniqueVarNameGenerator(UniqueNameGenerator): def __init__(self, existing_names=set(), forced_prefix=""): - super(_UniqueVarNameGenerator, self).__init__(existing_names, forced_prefix) + super().__init__(existing_names, forced_prefix) array_prefix_pattern = re.compile("(.*)_s[0-9]+$") array_prefixes = set() @@ -95,7 +92,7 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): # {{{ loop kernel object -class _deprecated_KernelState_SCHEDULED(object): # noqa +class _deprecated_KernelState_SCHEDULED: # noqa def __init__(self, f): self.f = f @@ -117,7 +114,7 @@ class KernelState: # noqa # {{{ kernel_state, KernelState compataibility -class _deperecated_kernel_state_class_method(object): # noqa +class _deperecated_kernel_state_class_method: # noqa def __init__(self, f): self.f = f @@ -127,7 +124,7 @@ class _deperecated_kernel_state_class_method(object): # noqa return self.f() -class kernel_state(object): # noqa +class kernel_state: # noqa """Deprecated. Use :class:`loopy.kernel.KernelState` instead. """ @@ -241,6 +238,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): would be called from other top level kernels. Default value is *True*. + .. automethod:: __call__ + .. automethod:: copy """ # {{{ constructor @@ -351,7 +350,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if state not in [ KernelState.INITIAL, KernelState.PREPROCESSED, - KernelState.SCHEDULED, + KernelState.LINEARIZED, ]: raise ValueError("invalid value for 'state'") @@ -375,7 +374,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): from collections import defaultdict assert not isinstance(iname_to_tags, defaultdict) - for iname, tags in six.iteritems(iname_to_tags): + for iname, tags in iname_to_tags.items(): # don't tolerate empty sets assert tags assert isinstance(tags, frozenset) @@ -479,25 +478,25 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def non_iname_variable_names(self): - return (set(six.iterkeys(self.arg_dict)) - | set(six.iterkeys(self.temporary_variables))) + return (set(self.arg_dict.keys()) + | set(self.temporary_variables.keys())) @memoize_method def all_variable_names(self, include_temp_storage=True): return ( - set(six.iterkeys(self.temporary_variables)) - | set(tv.base_storage - for tv in six.itervalues(self.temporary_variables) - if tv.base_storage is not None and include_temp_storage) - | set(six.iterkeys(self.substitutions)) - | set(arg.name for arg in self.args) + set(self.temporary_variables.keys()) + | {tv.base_storage + for tv in self.temporary_variables.values() + if tv.base_storage is not None and include_temp_storage} + | set(self.substitutions.keys()) + | {arg.name for arg in self.args} | set(self.all_inames())) def get_var_name_generator(self): return _UniqueVarNameGenerator(self.all_variable_names()) def get_instruction_id_generator(self, based_on="insn"): - used_ids = set(insn.id for insn in self.instructions) + used_ids = {insn.id for insn in self.instructions} return UniqueNameGenerator(used_ids) @@ -506,7 +505,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if insns is None: insns = self.instructions - used_ids = set(insn.id for insn in insns) | extra_used_ids + used_ids = {insn.id for insn in insns} | extra_used_ids for id_str in generate_unique_names(based_on): if id_str not in used_ids: @@ -554,7 +553,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @property @memoize_method 
def id_to_insn(self): - return dict((insn.id, insn) for insn in self.instructions) + return {insn.id: insn for insn in self.instructions} # }}} @@ -649,10 +648,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def _get_home_domain_map(self): - return dict( - (iname, i_domain) + return { + iname: i_domain for i_domain, dom in enumerate(self.domains) - for iname in dom.get_var_names(dim_type.set)) + for iname in dom.get_var_names(dim_type.set)} def get_home_domain_index(self, iname): return self._get_home_domain_map()[iname] @@ -828,7 +827,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def all_referenced_inames(self): result = set() - for inames in six.itervalues(self.all_insn_inames()): + for inames in self.all_insn_inames().values(): result.update(inames) return result @@ -839,8 +838,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def iname_to_insns(self): - result = dict( - (iname, set()) for iname in self.all_inames()) + result = { + iname: set() for iname in self.all_inames()} for insn in self.instructions: for iname in self.insn_inames(insn): result[iname].add(insn.id) @@ -866,9 +865,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): tag, = tags tag_key_uses[tag.key].append(iname) - multi_use_keys = set( - key for key, user_inames in six.iteritems(tag_key_uses) - if len(user_inames) > 1) + multi_use_keys = { + key for key, user_inames in tag_key_uses.items() + if len(user_inames) > 1} multi_use_inames = set() for iname in cond_inames: @@ -888,13 +887,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): warn("Since version 2018.1, inames can hold multiple tags. Use " "iname_to_tags['iname'] instead. iname_to_tag.get('iname') will be " "removed at version 2019.0.", DeprecationWarning) - for iname, tags in six.iteritems(self.iname_to_tags): + for iname, tags in self.iname_to_tags.items(): if len(tags) > 1: raise LoopyError( - "iname {0} has multiple tags: {1}. " + "iname {} has multiple tags: {}. 
" "Use iname_to_tags['iname'] instead.".format(iname, tags)) - return dict((k, next(iter(v))) - for k, v in six.iteritems(self.iname_to_tags) if v) + return {k: next(iter(v)) + for k, v in self.iname_to_tags.items() if v} # }}} @@ -944,8 +943,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): result = {} admissible_vars = ( - set(arg.name for arg in self.args) - | set(six.iterkeys(self.temporary_variables))) + {arg.name for arg in self.args} + | set(self.temporary_variables.keys())) for insn in self.instructions: for var_name in insn.read_dependency_names() & admissible_vars: @@ -987,7 +986,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def get_temporary_to_base_storage_map(self): result = {} - for tv in six.itervalues(self.temporary_variables): + for tv in self.temporary_variables.values(): if tv.base_storage: result[tv.name] = tv.base_storage @@ -998,10 +997,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): written_vars = self.get_written_variables() from loopy.kernel.data import ValueArg - return set( + return { arg.name for arg in self.args - if isinstance(arg, ValueArg) and arg.name not in written_vars) + if isinstance(arg, ValueArg) and arg.name not in written_vars} # }}} @@ -1010,7 +1009,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @property @memoize_method def arg_dict(self): - return dict((arg.name, arg) for arg in self.args) + return {arg.name: arg for arg in self.args} @property @memoize_method @@ -1032,14 +1031,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): from loopy.kernel.data import ArrayArg return ( - set( + { arg.name for arg in self.args - if isinstance(arg, ArrayArg) - and arg.address_space == AddressSpace.GLOBAL) - | set( + if (isinstance(arg, ArrayArg) + and arg.address_space == AddressSpace.GLOBAL)} + | { tv.name - for tv in six.itervalues(self.temporary_variables) - if tv.address_space == AddressSpace.GLOBAL)) + for tv in self.temporary_variables.values() + if tv.address_space == AddressSpace.GLOBAL}) # }}} @@ -1207,7 +1206,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): forced_sizes = forced_sizes.copy() size_list = [] - sorted_axes = sorted(six.iterkeys(size_dict)) + sorted_axes = sorted(size_dict.keys()) while sorted_axes or forced_sizes: if sorted_axes: @@ -1286,15 +1285,15 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def local_var_names(self): from loopy.kernel.data import AddressSpace - return set( + return { tv.name - for tv in six.itervalues(self.temporary_variables) - if tv.address_space == AddressSpace.LOCAL) + for tv in self.temporary_variables.values() + if tv.address_space == AddressSpace.LOCAL} def local_mem_use(self): from loopy.kernel.data import AddressSpace return sum( - tv.nbytes for tv in six.itervalues(self.temporary_variables) + tv.nbytes for tv in self.temporary_variables.values() if tv.address_space == AddressSpace.LOCAL) # }}} @@ -1327,13 +1326,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): "consistent iname nesting order. 
This is a possible indication " "that the kernel may not schedule successfully, but for now " "it only impacts printing of the kernel.") - embedding = dict((iname, iname) for iname in self.all_inames()) + embedding = {iname: iname for iname in self.all_inames()} return embedding def stringify(self, what=None, with_dependencies=False, use_separators=True, show_labels=True): - all_what = set([ + all_what = { "name", "arguments", "domains", @@ -1343,10 +1342,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): "instructions", "Dependencies", "schedule", - ]) + } - first_letter_to_what = dict( - (w[0], w) for w in all_what) + first_letter_to_what = { + w[0]: w for w in all_what} assert len(first_letter_to_what) == len(all_what) if what is None: @@ -1357,11 +1356,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): if isinstance(what, str): if "," in what: what = what.split(",") - what = set(s.strip() for s in what) + what = {s.strip() for s in what} else: - what = set( + what = { first_letter_to_what[w] - for w in what) + for w in what} if not (what <= all_what): raise LoopyError("invalid 'what' passed: %s" @@ -1406,14 +1405,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): else: tags_str = ", ".join(str(tag) for tag in tags) - line = "%s: %s" % (iname, tags_str) + line = f"{iname}: {tags_str}" lines.append(line) if "variables" in what and kernel.temporary_variables: lines.extend(sep) if show_labels: lines.append("TEMPORARIES:") - for tv in natsorted(six.itervalues(kernel.temporary_variables), + for tv in natsorted(kernel.temporary_variables.values(), key=lambda tv: tv.name): lines.append(str(tv)) @@ -1421,7 +1420,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): lines.extend(sep) if show_labels: lines.append("SUBSTITUTION RULES:") - for rule_name in natsorted(six.iterkeys(kernel.substitutions)): + for rule_name in natsorted(kernel.substitutions.keys()): lines.append(str(kernel.substitutions[rule_name])) if "instructions" in what: @@ -1435,7 +1434,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): dep_lines = [] for insn in kernel.instructions: if insn.depends_on: - dep_lines.append("%s : %s" % (insn.id, ",".join(insn.depends_on))) + dep_lines.append("{} : {}".format( + insn.id, ",".join(insn.depends_on))) if "Dependencies" in what and dep_lines: lines.extend(sep) @@ -1456,11 +1456,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return "\n".join(lines) def __str__(self): - if six.PY3: - return self.stringify() - else: - # Path of least resistance... - return self.stringify().encode("utf-8") + return self.stringify() def __unicode__(self): return self.stringify() @@ -1478,6 +1474,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ direct execution def __call__(self, *args, **kwargs): + """ + Execute the :class:`LoopKernel`. 
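+
+        A minimal usage sketch (illustrative only; *queue* and the argument
+        name *a* are hypothetical)::
+
+            evt, (out,) = knl(queue, a=a_dev)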
+ """ warn("Calling a LoopKernel is deprecated, call a Program " "instead.", DeprecationWarning, stacklevel=2) from loopy.program import make_program @@ -1489,10 +1488,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ pickling def __getstate__(self): - result = dict( - (key, getattr(self, key)) + result = { + key: getattr(self, key) for key in self.__class__.fields - if hasattr(self, key)) + if hasattr(self, key)} result.pop("cache_manager", None) @@ -1523,7 +1522,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): new_fields = set() - for k, v in six.iteritems(attribs): + for k, v in attribs.items(): setattr(self, k, v) new_fields.add(k) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 2e43d9b605c313add3d353da38b138f9d57bb9b7..eabaa0900d9238a6b01b2784c2d46deedff701e0 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1,6 +1,5 @@ """Implementation tagging of array axes.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -26,10 +25,6 @@ THE SOFTWARE. import re -import six -from six.moves import range, zip -from six import iteritems - from pytools import ImmutableRecord, memoize_method import numpy as np # noqa @@ -38,6 +33,25 @@ from loopy.diagnostic import LoopyError from loopy.tools import is_integer +__doc__ = """ +.. currentmodule:: loopy.kernel.array + +.. autoclass:: ArrayDimImplementationTag + +.. autoclass:: _StrideArrayDimTagBase + +.. autoclass:: FixedStrideArrayDimTag + +.. autoclass:: ComputedStrideArrayDimTag + +.. autoclass:: SeparateArrayArrayDimTag + +.. autoclass:: VectorArrayDimTag + +.. autofunction:: parse_array_dim_tags +""" + + # {{{ array dimension tags class ArrayDimImplementationTag(ImmutableRecord): @@ -69,9 +83,8 @@ class _StrideArrayDimTagBase(ArrayDimImplementationTag): The lowest nesting level varies fastest when viewed in linear memory. - May be None on :class:`FixedStrideArrayDimTag`, in which - case no :class:`ComputedStrideArrayDimTag` instances may - occur. + May be None on :class:`FixedStrideArrayDimTag`, in which case no + :class:`ComputedStrideArrayDimTag` instances may occur. """ @@ -132,8 +145,8 @@ class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): :attr:`ArrayBase.dtype` granularity to which to pad this dimension - This type of stride arg dim gets converted to :class:`FixedStrideArrayDimTag` - on input to :class:`ArrayBase` subclasses. + This type of stride arg dim gets converted to + :class:`FixedStrideArrayDimTag` on input to :class:`ArrayBase` subclasses. """ def __init__(self, layout_nesting_level, pad_to=None, target_axis=0, ): @@ -304,7 +317,7 @@ def parse_array_dim_tags(dim_tags, n_axes=None, use_increasing_target_axes=False assert n_axes == len(dim_names) dim_tags = [None]*n_axes - for dim_name, val in six.iteritems(dim_tags_dict): + for dim_name, val in dim_tags_dict.items(): try: dim_idx = dim_names.index(dim_name) except ValueError: @@ -370,7 +383,7 @@ def parse_array_dim_tags(dim_tags, n_axes=None, use_increasing_target_axes=False # {{{ check contiguity of nesting levels - for target_axis, ta_nesting_levels in iteritems(nesting_levels): + for target_axis, ta_nesting_levels in nesting_levels.items(): if sorted(ta_nesting_levels) != list( range( min(ta_nesting_levels), @@ -653,7 +666,7 @@ class ArrayBase(ImmutableRecord): or a string which can be parsed into the previous form. :arg dim_tags: A comma-separated list of tags as understood by - :func:`parse_array_dim_tag`. + :func:`loopy.kernel.array.parse_array_dim_tags`. 
:arg strides: May be one of the following: @@ -881,7 +894,7 @@ class ArrayBase(ImmutableRecord): if self.dim_names is not None: info_entries.append("shape: (%s)" % ", ".join( - "%s:%s" % (n, i) + f"{n}:{i}" for n, i in zip(self.dim_names, self.shape))) else: info_entries.append("shape: (%s)" @@ -895,7 +908,7 @@ class ArrayBase(ImmutableRecord): if self.offset: info_entries.append("offset: %s" % self.offset) - return "%s: %s" % (self.name, ", ".join(info_entries)) + return "{}: {}".format(self.name, ", ".join(info_entries)) def __str__(self): return self.stringify(include_typename=True) @@ -935,7 +948,8 @@ class ArrayBase(ImmutableRecord): return len(target_axes) def num_user_axes(self, require_answer=True): - if self.shape is not None: + from loopy import auto + if self.shape not in (None, auto): return len(self.shape) if self.dim_tags is not None: return len(self.dim_tags) @@ -1088,8 +1102,7 @@ class ArrayBase(ImmutableRecord): offset_for_name=full_name, is_written=False) - for sa in stride_args: - yield sa + yield from stride_args # }}} @@ -1115,13 +1128,12 @@ class ArrayBase(ImmutableRecord): new_stride_arg_axes = stride_arg_axes new_stride_axis = dim_tag.stride - for res in gen_decls(name_suffix, + yield from gen_decls(name_suffix, shape + (new_shape_axis,), strides + (new_stride_axis,), unvec_shape + (new_shape_axis,), unvec_strides + (new_stride_axis,), new_stride_arg_axes, - dtype, user_index + (None,)): - yield res + dtype, user_index + (None,)) elif isinstance(dim_tag, SeparateArrayArrayDimTag): shape_i = array_shape[user_axis] @@ -1131,11 +1143,10 @@ class ArrayBase(ImmutableRecord): self.name, user_axis)) for i in range(shape_i): - for res in gen_decls(name_suffix + "_s%d" % i, + yield from gen_decls(name_suffix + "_s%d" % i, shape, strides, unvec_shape, unvec_strides, stride_arg_axes, dtype, - user_index + (i,)): - yield res + user_index + (i,)) elif isinstance(dim_tag, VectorArrayDimTag): shape_i = array_shape[user_axis] @@ -1144,26 +1155,24 @@ class ArrayBase(ImmutableRecord): "integer axis %d (0-based)" % ( self.name, user_axis)) - for res in gen_decls(name_suffix, + yield from gen_decls(name_suffix, shape, strides, unvec_shape + (shape_i,), # vectors always have stride 1 unvec_strides + (1,), stride_arg_axes, target.vector_dtype(dtype, shape_i), - user_index + (None,)): - yield res + user_index + (None,)) else: raise LoopyError("unsupported array dim implementation tag '%s' " "in array '%s'" % (dim_tag, self.name)) - for res in gen_decls(name_suffix="", + yield from gen_decls(name_suffix="", shape=(), strides=(), unvec_shape=(), unvec_strides=(), stride_arg_axes=(), - dtype=self.dtype, user_index=()): - yield res + dtype=self.dtype, user_index=()) @memoize_method def sep_shape(self): @@ -1194,11 +1203,10 @@ class ArrayBase(ImmutableRecord): else: return idx - from pytools import indices_in_shape return [ (unwrap_1d_indices(i), self.name + "".join("_s%d" % sub_i for sub_i in i)) - for i in indices_in_shape(sep_shape)] + for i in np.ndindex(sep_shape)] # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e0834ba9dfbc2d68c063623f77889f04b977b156..0f7a0deff491c68ad171c79c4775deafafac2dfc 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1,6 +1,5 @@ """UI for kernel creation.""" -from __future__ import division, absolute_import, print_function __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -43,8 +42,7 @@ import islpy as isl from islpy import dim_type from pytools import ProcessLogger -import six -from six.moves import 
range, zip, intern
+from sys import intern

 import loopy.version
 import re
@@ -202,7 +200,7 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
                 raise ValueError(
                         "unknown scope for nosync option: '%s' "
                         "(allowable scopes are %s)" %
-                        (scope, ', '.join("'%s'" % s for s in allowable_scopes)))
+                        (scope, ", ".join("'%s'" % s for s in allowable_scopes)))

         return _NosyncParseResult(expr, scope)

     for option in options_str.split(","):
@@ -363,7 +361,7 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):

         elif opt_key == "mem_kind":
             opt_value = opt_value.lower().strip()
-            if opt_value not in ['local', 'global']:
+            if opt_value not in ["local", "global"]:
                 raise LoopyError("Unknown memory synchronization type %s specified"
                     " expected, 'local' or 'global'." % opt_value)

@@ -439,13 +437,13 @@ SUBST_RE = re.compile(

 def check_illegal_options(insn_options, insn_type):
     illegal_options = []
-    if insn_type not in ['gbarrier', 'lbarrier']:
-        illegal_options.append('mem_kind')
+    if insn_type not in ["gbarrier", "lbarrier"]:
+        illegal_options.append("mem_kind")

     bad_options = [x for x in illegal_options if x in insn_options]
     if bad_options:
         raise LoopyError("Cannot supply option(s) '%s' to instruction type '%s'" %
-                         ', '.join(bad_options), insn_type)
+                         (", ".join(bad_options), insn_type))


 def parse_insn(groups, insn_options):
@@ -520,7 +518,7 @@ def parse_insn(groups, insn_options):
             assignee_names=assignee_names)

     # check for bad options
-    check_illegal_options(insn_options, 'assignment')
+    check_illegal_options(insn_options, "assignment")

     insn_id = insn_options.pop("insn_id", None)
     inames_to_dup = insn_options.pop("inames_to_dup", [])
@@ -761,8 +759,8 @@ def parse_instructions(instructions, defines):

     insn_options_stack = [get_default_insn_options_dict()]
     if_predicates_stack = [
-            {'predicates': frozenset(),
-                'insn_predicates': frozenset()}]
+            {"predicates": frozenset(),
+                "insn_predicates": frozenset()}]

     for insn in instructions:
         if isinstance(insn, InstructionBase):
@@ -823,7 +821,7 @@ def parse_instructions(instructions, defines):
                             insn_options_stack[-1],
                             with_options_match.group("options")))
                 # check for bad options
-                check_illegal_options(insn_options_stack[-1], 'with-block')
+                check_illegal_options(insn_options_stack[-1], "with-block")
                 continue

             for_match = FOR_RE.match(insn)
@@ -863,7 +861,7 @@ def parse_instructions(instructions, defines):

                 #add to the if_stack
                 if_options = options.copy()
-                if_options['insn_predicates'] = options["predicates"]
+                if_options["insn_predicates"] = options["predicates"]
                 if_predicates_stack.append(if_options)
                 del options
                 del predicate
@@ -927,9 +925,9 @@ def parse_instructions(instructions, defines):
             if insn == "end":
                 obj = insn_options_stack.pop()
                 #if this object is the end of an if statement
-                if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\
+                if obj["predicates"] == if_predicates_stack[-1]["insn_predicates"] and\
                         if_predicates_stack[-1]["insn_predicates"] and\
-                        obj['within_inames'] == if_predicates_stack[-1]['within_inames']:
+                        obj["within_inames"] == if_predicates_stack[-1]["within_inames"]:
                     if_predicates_stack.pop()
                 continue

@@ -991,8 +989,8 @@ def _find_inames_in_set(dom_str):
     if match is None:
         raise RuntimeError("invalid syntax for domain '%s'" % dom_str)

-    result = set(iname.strip() for iname in match.group(1).split(",")
-            if iname.strip())
+    result = {iname.strip() for iname in match.group(1).split(",")
+            if iname.strip()}

     return result

@@ -1001,7 +999,7 @@ EX_QUANT_RE = re.compile(r"\bexists\s+([a-zA-Z0-9])\s*\:")
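
The regular expressions around this hunk do a lightweight scan of ISL-style
domain strings before the string is handed to islpy proper. A minimal,
self-contained sketch of the same idea; the patterns below are simplified
stand-ins, not the exact ones loopy uses::

    import re

    INAMES_RE = re.compile(r"\{\s*\[([^\]]*)\]")             # hypothetical
    EXISTS_RE = re.compile(r"\bexists\s+([a-zA-Z0-9])\s*:")  # hypothetical

    def inames_in(dom_str):
        match = INAMES_RE.search(dom_str)
        if match is None:
            raise RuntimeError("invalid syntax for domain '%s'" % dom_str)
        # inames are the comma-separated names inside the leading "[...]"
        return {s.strip() for s in match.group(1).split(",") if s.strip()}

    dom = "{ [i, j]: 0 <= i and exists k: j = 2k }"
    print(inames_in(dom))                                 # {'i', 'j'}
    print({m.group(1) for m in EXISTS_RE.finditer(dom)})  # {'k'}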
def _find_existentially_quantified_inames(dom_str): - return set(ex_quant.group(1) for ex_quant in EX_QUANT_RE.finditer(dom_str)) + return {ex_quant.group(1) for ex_quant in EX_QUANT_RE.finditer(dom_str)} def parse_domains(domains, defines): @@ -1020,7 +1018,7 @@ def parse_domains(domains, defines): parameters = (_gather_isl_identifiers(dom) - _find_inames_in_set(dom) - _find_existentially_quantified_inames(dom)) - dom = "[%s] -> %s" % (",".join(sorted(parameters)), dom) + dom = "[{}] -> {}".format(",".join(sorted(parameters)), dom) try: dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom) @@ -1182,7 +1180,7 @@ class ArgumentGuesser: # {{{ find names that are *not* arguments - temp_var_names = set(six.iterkeys(self.temporary_variables)) + temp_var_names = set(self.temporary_variables.keys()) for insn in self.instructions: if isinstance(insn, MultiAssignmentBase): @@ -1276,8 +1274,8 @@ def check_for_multiple_writes_to_loop_bounds(knl): def check_written_variable_names(knl): admissible_vars = ( - set(arg.name for arg in knl.args) - | set(six.iterkeys(knl.temporary_variables))) + {arg.name for arg in knl.args} + | set(knl.temporary_variables.keys())) for insn in knl.instructions: for var_name in insn.assignee_var_names(): @@ -1298,7 +1296,7 @@ class CSEToAssignmentMapper(IdentityMapper): def map_reduction(self, expr, additional_inames): additional_inames = additional_inames | frozenset(expr.inames) - return super(CSEToAssignmentMapper, self).map_reduction( + return super().map_reduction( expr, additional_inames) def map_common_subexpression(self, expr, additional_inames): @@ -1521,7 +1519,7 @@ def determine_shapes_of_temporaries(knl): vars_needing_shape_inference = set() - for tv in six.itervalues(knl.temporary_variables): + for tv in knl.temporary_variables.values(): if tv.shape is lp.auto or tv.base_indices is lp.auto: vars_needing_shape_inference.add(tv.name) @@ -1539,8 +1537,7 @@ def determine_shapes_of_temporaries(knl): if len(var_to_error) > 0: vars_needing_shape_inference = set(var_to_error.keys()) - from six import iteritems - for varname, err in iteritems(var_to_error): + for varname, err in var_to_error.items(): warn_with_kernel(knl, "temp_shape_fallback", "Had to fall back to legacy method of determining " "shape of temporary '%s' because: %s" @@ -1558,7 +1555,7 @@ def determine_shapes_of_temporaries(knl): if len(var_to_error) > 0: # No way around errors: propagate an exception upward. 
formatted_errors = ( - "\n\n".join("'%s': %s" % (varname, var_to_error[varname]) + "\n\n".join("'{}': {}".format(varname, var_to_error[varname]) for varname in sorted(var_to_error.keys()))) raise LoopyError("got the following exception(s) trying to find the " @@ -1571,7 +1568,7 @@ def determine_shapes_of_temporaries(knl): new_temp_vars = {} - for tv in six.itervalues(knl.temporary_variables): + for tv in knl.temporary_variables.values(): if tv.base_indices is lp.auto: tv = tv.copy(base_indices=var_to_base_indices[tv.name]) if tv.shape is lp.auto: @@ -1600,7 +1597,7 @@ def expand_defines_in_shapes(kernel, defines): processed_args.append(arg) processed_temp_vars = {} - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): processed_temp_vars[tv.name] = tv.map_exprs(expr_map) return kernel.copy( @@ -1763,13 +1760,13 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): writer_map = kernel.writer_map() - arg_names = set(arg.name for arg in kernel.args) + arg_names = {arg.name for arg in kernel.args} - var_names = arg_names | set(six.iterkeys(kernel.temporary_variables)) + var_names = arg_names | set(kernel.temporary_variables.keys()) - dep_map = dict( - (insn.id, insn.read_dependency_names() & var_names) - for insn in expanded_kernel.instructions) + dep_map = { + insn.id: insn.read_dependency_names() & var_names + for insn in expanded_kernel.instructions} new_insns = [] for insn in kernel.instructions: @@ -1793,7 +1790,7 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): if len(var_writers) == 1: auto_deps.update( var_writers - - set([insn.id])) + - {insn.id}) # }}} @@ -2128,7 +2125,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): breaking language changes *will* apply to your kernel without asking, likely breaking your code.) - If not given, this value defaults to version **(2017, 2, 1)** and + If not given, this value defaults to version **(2018, 2)** and a warning will be issued. To set the kernel version for all :mod:`loopy` kernels in a (Python) source @@ -2194,9 +2191,9 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = { + getattr(loopy.version, lvs): lvs + for lvs in LANGUAGE_VERSION_SYMBOLS} lang_version = kwargs.pop("lang_version", None) if lang_version is None: @@ -2236,11 +2233,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): lang_version = FALLBACK_LANGUAGE_VERSION if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." 
% (lang_version,))
-    if lang_version >= (2018, 1):
-        options = options.copy(enforce_variable_access_ordered=True)
-    if lang_version >= (2018, 2):
-        options = options.copy(ignore_boostable_into=True)
+        raise LoopyError(f"Language version '{lang_version}' is not known.")

     # }}}
@@ -2398,7 +2391,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):

 def make_function(*args, **kwargs):
-    kwargs['is_callee_kernel'] = True
+    kwargs["is_callee_kernel"] = True
     return make_kernel(*args, **kwargs)

 # }}}
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 6c0fa0a303d22fa931fe797ff3653d2819d4aa8d..073dc6f6579e005f7c627412ff763d6f019f95fd 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -1,6 +1,5 @@
 """Data used by the kernel object."""

-from __future__ import division

 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

@@ -25,7 +24,7 @@ THE SOFTWARE.
 """


-from six.moves import intern
+from sys import intern
 import numpy as np  # noqa
 from pytools import ImmutableRecord
 from loopy.kernel.array import ArrayBase
@@ -45,8 +44,30 @@ from loopy.kernel.instruction import (  # noqa
         CInstruction)
 from warnings import warn

+__doc__ = """
+.. currentmodule:: loopy.kernel.data

-class auto(object):  # noqa
+.. autofunction:: filter_iname_tags_by_type
+
+.. autoclass:: IndexTag
+
+.. autoclass:: ConcurrentTag
+
+.. autoclass:: UniqueTag
+
+.. autoclass:: AxisTag
+
+.. autoclass:: LocalIndexTag
+
+.. autoclass:: GroupIndexTag
+
+.. autoclass:: VectorizeTag
+
+.. autoclass:: UnrollTag
+"""
+
+
+class auto:  # noqa
     """A generic placeholder object for something that should be automatically
     determined.  See, for example, the *shape* or *strides* argument of
     :class:`ArrayArg`.
@@ -67,7 +88,7 @@ def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None):

     :arg min_num: the minimum number of tags expected to be found.
     """

-    result = set(tag for tag in tags if isinstance(tag, tag_type))
+    result = {tag for tag in tags if isinstance(tag, tag_type)}

     def strify_tag_type():
         if isinstance(tag_type, tuple):
@@ -77,12 +98,12 @@

     if max_num is not None:
         if len(result) > max_num:
-            raise LoopyError("cannot have more than {0} tags "
-                    "of type(s): {1}".format(max_num, strify_tag_type()))
+            raise LoopyError("cannot have more than {} tags "
+                    "of type(s): {}".format(max_num, strify_tag_type()))
     if min_num is not None:
         if len(result) < min_num:
-            raise LoopyError("must have more than {0} tags "
-                    "of type(s): {1}".format(max_num, strify_tag_type()))
+            raise LoopyError("must have at least {} tags "
+                    "of type(s): {}".format(min_num, strify_tag_type()))

     return result

@@ -244,7 +265,7 @@ def parse_tag(tag):

 # {{{ memory address space

-class AddressSpace(object):
+class AddressSpace:
     """Storage location of a variable.

     .. attribute:: PRIVATE
@@ -271,7 +292,7 @@
         raise ValueError("unexpected value of AddressSpace")


-class _deprecated_temp_var_scope_class_method(object):  # noqa
+class _deprecated_temp_var_scope_class_method:  # noqa
     def __init__(self, f):
         self.f = f

@@ -281,8 +302,8 @@

         return self.f()


-class temp_var_scope(object):  # noqa
-    """Deprecated. Use :class:`AddressSpace` instead.
+class temp_var_scope:  # noqa
+    """Deprecated. Use :class:`loopy.AddressSpace` instead.
""" @_deprecated_temp_var_scope_class_method @@ -318,8 +339,8 @@ class KernelArgument(ImmutableRecord): dtype = kwargs.pop("dtype", None) - if 'for_atomic' in kwargs: - for_atomic = kwargs['for_atomic'] + if "for_atomic" in kwargs: + for_atomic = kwargs["for_atomic"] else: for_atomic = False @@ -384,7 +405,7 @@ class ArrayArg(ArrayBase, KernelArgument): kwargs["is_output"] = kwargs.pop("is_output", None) kwargs["is_input"] = kwargs.pop("is_input", None) - super(ArrayArg, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) min_target_axes = 0 max_target_axes = 1 @@ -408,7 +429,7 @@ class ArrayArg(ArrayBase, KernelArgument): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. """ - super(ArrayArg, self).update_persistent_hash(key_hash, key_builder) + super().update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, self.address_space) key_builder.rec(key_hash, self.is_output) key_builder.rec(key_hash, self.is_input) @@ -474,7 +495,7 @@ class ValueArg(KernelArgument): else: type_str = str(self.dtype) - return "%s: ValueArg, type: %s" % (self.name, type_str) + return f"{self.name}: ValueArg, type: {type_str}" def __repr__(self): return "<%s>" % self.__str__() @@ -550,7 +571,7 @@ class TemporaryVariable(ArrayBase): "_base_storage_access_may_be_aliasing", ] - def __init__(self, name, dtype=None, shape=(), address_space=None, + def __init__(self, name, dtype=None, shape=auto, address_space=None, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, base_indices=None, storage_shape=None, base_storage=None, initializer=None, read_only=False, @@ -604,7 +625,10 @@ class TemporaryVariable(ArrayBase): if shape is auto: shape = initializer.shape - + else: + if shape != initializer.shape: + raise LoopyError("Shape of '{}' does not match that of the" + " initializer.".format(name)) else: raise LoopyError( "temporary variable '%s': " @@ -614,7 +638,7 @@ class TemporaryVariable(ArrayBase): if order is None: order = "C" - if base_indices is None: + if base_indices is None and shape is not auto: base_indices = (0,) * len(shape) if not read_only and initializer is not None: @@ -680,7 +704,7 @@ class TemporaryVariable(ArrayBase): if address_space is not None: kwargs["address_space"] = address_space - return super(TemporaryVariable, self).copy(**kwargs) + return super().copy(**kwargs) @property def nbytes(self): @@ -692,7 +716,7 @@ class TemporaryVariable(ArrayBase): return product(si for si in shape)*self.dtype.itemsize def decl_info(self, target, index_dtype): - return super(TemporaryVariable, self).decl_info( + return super().decl_info( target, is_written=True, index_dtype=index_dtype, shape_override=self.storage_shape) @@ -717,7 +741,7 @@ class TemporaryVariable(ArrayBase): def __eq__(self, other): return ( - super(TemporaryVariable, self).__eq__(other) + super().__eq__(other) and self.storage_shape == other.storage_shape and self.base_indices == other.base_indices and self.address_space == other.address_space @@ -735,7 +759,7 @@ class TemporaryVariable(ArrayBase): :class:`pytools.persistent_dict.PersistentDict`. 
""" - super(TemporaryVariable, self).update_persistent_hash(key_hash, key_builder) + super().update_persistent_hash(key_hash, key_builder) self.update_persistent_hash_for_shape(key_hash, key_builder, self.storage_shape) key_builder.rec(key_hash, self.base_indices) @@ -783,7 +807,7 @@ class SubstitutionRule(ImmutableRecord): name=name, arguments=arguments, expression=expression) def __str__(self): - return "%s(%s) := %s" % ( + return "{}({}) := {}".format( self.name, ", ".join(self.arguments), self.expression) def update_persistent_hash(self, key_hash, key_builder): @@ -809,19 +833,19 @@ class CallMangleInfo(ImmutableRecord): .. attribute:: result_dtypes - A tuple of :class:`LoopyType` instances indicating what + A tuple of :class:`loopy.types.LoopyType` instances indicating what types of values the function returns. .. attribute:: arg_dtypes - A tuple of :class:`LoopyType` instances indicating what + A tuple of :class:`loopy.types.LoopyType` instances indicating what types of arguments the function actually receives. """ def __init__(self, target_name, result_dtypes, arg_dtypes): assert isinstance(result_dtypes, tuple) - super(CallMangleInfo, self).__init__( + super().__init__( target_name=target_name, result_dtypes=result_dtypes, arg_dtypes=arg_dtypes) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index a1e2213020d50e8e564214e3ecddb75acc065c6b..f48e8852f0fb756142f505ed76798760251e4674 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" __license__ = """ @@ -22,10 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - -from six.moves import zip import islpy as isl - from pytools import ImmutableRecord from loopy.diagnostic import LoopyError @@ -83,7 +78,7 @@ class ArrayArgDescriptor(ImmutableRecord): A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` """ - fields = set(['shape', 'address_space', 'dim_tags']) + fields = {"shape", "address_space", "dim_tags"} def __init__(self, shape, address_space, dim_tags): @@ -100,7 +95,7 @@ class ArrayArgDescriptor(ImmutableRecord): # }}} - super(ArrayArgDescriptor, self).__init__( + super().__init__( shape=shape, address_space=address_space, dim_tags=dim_tags) @@ -266,7 +261,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): This class acts as a pseudo-callable and its significance lies in solving picklability issues. """ - fields = set(["local_size", "global_size"]) + fields = {"local_size", "global_size"} def __init__(self, global_size, local_size): self.global_size = global_size @@ -319,12 +314,12 @@ class InKernelCallable(ImmutableRecord): .. 
automethod:: is_ready_for_codegen """ - fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) + fields = {"arg_id_to_dtype", "arg_id_to_descr"} init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): - super(InKernelCallable, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -396,8 +391,8 @@ class InKernelCallable(ImmutableRecord): new_arg_id_to_dtype = None if self.arg_id_to_dtype is not None: - new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, - dtype in self.arg_id_to_dtype.items()) + new_arg_id_to_dtype = {id: with_target_if_not_None(dtype) for id, + dtype in self.arg_id_to_dtype.items()} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) @@ -463,7 +458,7 @@ class ScalarCallable(InKernelCallable): derived subclasses. """ - fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") hash_fields = fields @@ -471,7 +466,7 @@ class ScalarCallable(InKernelCallable): def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): - super(ScalarCallable, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -629,7 +624,7 @@ class CallableKernel(InKernelCallable): sizes for the :attr:`subkernel` of the callable. """ - fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr"]) + fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") hash_fields = fields @@ -637,7 +632,7 @@ class CallableKernel(InKernelCallable): arg_id_to_descr=None): assert isinstance(subkernel, LoopKernel) - super(CallableKernel, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -731,8 +726,8 @@ class CallableKernel(InKernelCallable): subst_mapper = SubstitutionMapper(subst_func) - arg_id_to_descr = dict((arg_id, descr.map_expr(subst_mapper)) for - arg_id, descr in arg_id_to_descr.items()) + arg_id_to_descr = {arg_id: descr.map_expr(subst_mapper) for + arg_id, descr in arg_id_to_descr.items()} # }}} @@ -795,8 +790,8 @@ class CallableKernel(InKernelCallable): callables_table)) if assumptions: - args_added_knl = assume(args_added_knl, ' and '.join([ - '{0}={1}'.format(key, val) for key, val in assumptions.items()])) + args_added_knl = assume(args_added_knl, " and ".join([ + f"{key}={val}" for key, val in assumptions.items()])) return ( self.copy( @@ -904,19 +899,19 @@ class ManglerCallable(ScalarCallable): A function of signature ``(kernel, name , arg_dtypes)`` and returns an instance of ``loopy.CallMangleInfo``. 
""" - fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) + fields = {"name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"} init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") - hash_fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) + hash_fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"} def __init__(self, name, function_mangler, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): self.function_mangler = function_mangler - super(ManglerCallable, self).__init__( + super().__init__( name=name, arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr, @@ -945,8 +940,8 @@ class ManglerCallable(ScalarCallable): arg_dtypes) if mangle_result: new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) - new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in - enumerate(mangle_result.result_dtypes))) + new_arg_id_to_dtype.update({-i-1: dtype for i, dtype in + enumerate(mangle_result.result_dtypes)}) return ( self.copy(name_in_target=mangle_result.target_name, arg_id_to_dtype=new_arg_id_to_dtype), diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index c5599863c9260086015e53efc413faf667a80738..6c1fa64e3afcbf86febf4511053c57b261238228 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" __license__ = """ @@ -22,8 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six -from six.moves import intern +from sys import intern from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError from loopy.tools import Optional @@ -39,14 +36,14 @@ class InstructionBase(ImmutableRecord): .. attribute:: id An (otherwise meaningless) identifier that is unique within - a :class:`loopy.kernel.LoopKernel`. + a :class:`loopy.LoopKernel`. .. rubric:: Instruction ordering .. attribute:: depends_on - a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances - that *must* be executed before this one. Note that + a :class:`frozenset` of :attr:`id` values of :class:`InstructionBase` + instances that *must* be executed before this one. Note that :func:`loopy.preprocess_kernel` (usually invoked automatically) augments this by adding dependencies on any writes to temporaries read by this instruction. @@ -82,7 +79,7 @@ class InstructionBase(ImmutableRecord): .. attribute:: conflicts_with_groups A :class:`frozenset` of strings indicating which instruction groups - (see :class:`InstructionBase.groups`) may not be active when this + (see :attr:`groups`) may not be active when this instruction is scheduled. .. attribute:: priority @@ -95,7 +92,7 @@ class InstructionBase(ImmutableRecord): .. attribute:: no_sync_with a :class:`frozenset` of tuples of the form ``(insn_id, scope)``, where - `insn_id` refers to :attr:`id` of :class:`Instruction` instances + ``insn_id`` refers to :attr:`id` of :class:`InstructionBase` instances and `scope` is one of the following strings: - `"local"` @@ -114,7 +111,7 @@ class InstructionBase(ImmutableRecord): and match expression, just like :attr:`depends_on`. This data is used specifically by barrier insertion and - :func:`loopy.check.enforce_variable_access_ordered`. 
+ :func:`loopy.check.check_variable_access_ordered`. .. rubric:: Conditionals @@ -152,51 +149,27 @@ class InstructionBase(ImmutableRecord): .. automethod:: copy """ - # within_inames_is_final, boostable and boostable_into are deprecated and - # will be removed in version 2017.x. + # within_inames_is_final is deprecated and will be removed in version 2017.x. fields = set("id depends_on depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " "predicates " "within_inames_is_final within_inames " - "priority boostable boostable_into".split()) + "priority".split()) # Names of fields that are pymbolic expressions. Needed for key building pymbolic_fields = set("") # Names of fields that are sets of pymbolic expressions. Needed for key building - pymbolic_set_fields = set(["predicates"]) + pymbolic_set_fields = {"predicates"} def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, no_sync_with, within_inames_is_final, within_inames, priority, - boostable, boostable_into, predicates, tags, - insn_deps=None, insn_deps_is_final=None, - forced_iname_deps=None, forced_iname_deps_is_final=None): - - # {{{ backwards compatibility goop - - if depends_on is not None and insn_deps is not None: - raise LoopyError("may not specify both insn_deps and depends_on") - elif insn_deps is not None: - warn("insn_deps is deprecated, use depends_on", - DeprecationWarning, stacklevel=2) - - depends_on = insn_deps - depends_on_is_final = insn_deps_is_final - - if forced_iname_deps is not None and within_inames is not None: - raise LoopyError("may not specify both forced_iname_deps " - "and within_inames") - elif forced_iname_deps is not None: - warn("forced_iname_deps is deprecated, use within_inames", - DeprecationWarning, stacklevel=2) - - within_inames = forced_iname_deps - within_inames_is_final = forced_iname_deps_is_final + predicates, tags): if predicates is None: predicates = frozenset() @@ -218,8 +191,6 @@ class InstructionBase(ImmutableRecord): predicates = frozenset(new_predicates) del new_predicates - # }}} - if depends_on is None: depends_on = frozenset() @@ -284,42 +255,9 @@ class InstructionBase(ImmutableRecord): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags) - # {{{ backwards compatibility goop - - @property - def insn_deps(self): - warn("insn_deps is deprecated, use depends_on", - DeprecationWarning, stacklevel=2) - - return self.depends_on - - # legacy - @property - def insn_deps_is_final(self): - warn("insn_deps_is_final is deprecated, use depends_on_is_final", - DeprecationWarning, stacklevel=2) - - return self.depends_on_is_final - - @property - def forced_iname_deps(self): - warn("forced_iname_deps is deprecated, use within_inames", - DeprecationWarning, stacklevel=2) - return self.within_inames - - @property - def forced_iname_deps_is_final(self): - warn("forced_iname_deps_is_final is deprecated, use within_inames_is_final", - DeprecationWarning, stacklevel=2) - return self.within_inames_is_final - - # }}} - # {{{ abstract interface def read_dependency_names(self): @@ -346,10 +284,13 @@ class InstructionBase(ImmutableRecord): """ raise NotImplementedError - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): """Return a new copy of *self* where *f* has been applied to every expression occurring in *self*. 
+
+        If *assignee_f* is passed, then left-hand sides of assignments are
+        passed to it. If it is not given, it defaults to the same as *f*.
         """
         raise NotImplementedError

@@ -393,18 +334,6 @@ class InstructionBase(ImmutableRecord):
     def get_str_options(self):
         result = []

-        if self.boostable is True:
-            if self.boostable_into:
-                result.append("boostable into '%s'" % ",".join(self.boostable_into))
-            else:
-                result.append("boostable")
-        elif self.boostable is False:
-            result.append("not boostable")
-        elif self.boostable is None:
-            pass
-        else:
-            raise RuntimeError("unexpected value for Instruction.boostable")
-
         if self.depends_on:
             result.append("dep="+":".join(self.depends_on))
         if self.no_sync_with:
@@ -466,23 +395,8 @@ class InstructionBase(ImmutableRecord):

     # }}}

-    def copy(self, **kwargs):
-        if "insn_deps" in kwargs:
-            warn("insn_deps is deprecated, use depends_on",
-                    DeprecationWarning, stacklevel=2)
-
-            kwargs["depends_on"] = kwargs.pop("insn_deps")
-
-        if "insn_deps_is_final" in kwargs:
-            warn("insn_deps_is_final is deprecated, use depends_on",
-                    DeprecationWarning, stacklevel=2)
-
-            kwargs["depends_on_is_final"] = kwargs.pop("insn_deps_is_final")
-
-        return super(InstructionBase, self).copy(**kwargs)
-
     def __setstate__(self, val):
-        super(InstructionBase, self).__setstate__(val)
+        super().__setstate__(val)

         from loopy.tools import intern_frozenset_of_ids

@@ -582,7 +496,7 @@ class MemoryOrdering:  # noqa

 # {{{ memory_ordering, MemoryOrdering compatibility

-class _deprecated_memory_ordering_class_method(object):  # noqa
+class _deprecated_memory_ordering_class_method:  # noqa
     def __init__(self, f):
         self.f = f

@@ -592,7 +506,7 @@ class _deprecated_memory_ordering_class_method(object):  # noqa
         return self.f()


-class memory_ordering(object):  # noqa
+class memory_ordering:  # noqa
     """Deprecated. Use :class:`MemoryOrdering` instead.
     """

@@ -659,7 +573,7 @@ class MemoryScope:  # noqa

 # {{{ memory_scope, MemoryScope compatibility

-class _deprecated_memory_scope_class_method(object):  # noqa
+class _deprecated_memory_scope_class_method:  # noqa
     def __init__(self, f):
         self.f = f

@@ -669,7 +583,7 @@ class _deprecated_memory_scope_class_method(object):  # noqa
         return self.f()


-class memory_scope(object):  # noqa
+class memory_scope:  # noqa
     """Deprecated. Use :class:`MemoryScope` instead.
     """

@@ -702,7 +616,7 @@ class memory_scope(object):  # noqa

 # }}}


-class VarAtomicity(object):
+class VarAtomicity:
     """A base class for the description of how atomic access to :attr:`var_name`
     shall proceed.

@@ -747,13 +661,13 @@ class OrderedAtomic(VarAtomicity):
             :class:`pytools.persistent_dict.PersistentDict`.
""" - super(OrderedAtomic, self).update_persistent_hash(key_hash, key_builder) + super().update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, str(self.__class__.__name__)) key_builder.rec(key_hash, self.ordering) key_builder.rec(key_hash, self.scope) def __eq__(self, other): - return (super(OrderedAtomic, self).__eq__(other) + return (super().__eq__(other) and self.ordering == other.ordering and self.scope == other.scope) @@ -762,7 +676,7 @@ class OrderedAtomic(VarAtomicity): raise NotImplementedError def __str__(self): - return "%s[%s]%s/%s" % ( + return "{}[{}]{}/{}".format( self.op_name, self.var_name, MemoryOrdering.to_string(self.ordering), @@ -781,11 +695,12 @@ class AtomicInit(OrderedAtomic): One of the values from :class:`MemoryScope` """ - op_name = 'init' + op_name = "init" class AtomicUpdate(OrderedAtomic): - """Properties of an atomic update. A subclass of :class:`OrderedAtomic`. + """Properties of an atomic update. A subclass of + :class:`OrderedAtomic`. .. attribute:: ordering @@ -795,7 +710,7 @@ class AtomicUpdate(OrderedAtomic): One of the values from :class:`MemoryScope` """ - op_name = 'update' + op_name = "update" class AtomicLoad(OrderedAtomic): @@ -809,7 +724,7 @@ class AtomicLoad(OrderedAtomic): One of the values from :class:`MemoryScope` """ - op_name = 'load' + op_name = "load" # }}} @@ -819,14 +734,14 @@ class AtomicLoad(OrderedAtomic): class MultiAssignmentBase(InstructionBase): """An assignment instruction with an expression as a right-hand side.""" - fields = InstructionBase.fields | set(["expression"]) - pymbolic_fields = InstructionBase.pymbolic_fields | set(["expression"]) + fields = InstructionBase.fields | {"expression"} + pymbolic_fields = InstructionBase.pymbolic_fields | {"expression"} @memoize_method def read_dependency_names(self): from loopy.symbolic import get_dependencies result = ( - super(MultiAssignmentBase, self).read_dependency_names() + super().read_dependency_names() | get_dependencies(self.expression)) for subscript_deps in self.assignee_subscript_deps(): @@ -908,7 +823,7 @@ class Assignment(MultiAssignmentBase): fields = MultiAssignmentBase.fields | \ set("assignee temp_var_type atomicity".split()) - pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignee"]) + pymbolic_fields = MultiAssignmentBase.pymbolic_fields | {"assignee"} def __init__(self, assignee, expression, @@ -920,13 +835,11 @@ class Assignment(MultiAssignmentBase): no_sync_with=None, within_inames_is_final=None, within_inames=None, - boostable=None, boostable_into=None, tags=None, + tags=None, temp_var_type=Optional(), atomicity=(), - priority=0, predicates=frozenset(), - insn_deps=None, insn_deps_is_final=None, - forced_iname_deps=None, forced_iname_deps_is_final=None): + priority=0, predicates=frozenset()): - super(Assignment, self).__init__( + super().__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -935,15 +848,9 @@ class Assignment(MultiAssignmentBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, priority=priority, predicates=predicates, - tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final, - forced_iname_deps=forced_iname_deps, - forced_iname_deps_is_final=forced_iname_deps_is_final) + tags=tags) from loopy.symbolic import parse if isinstance(assignee, str): @@ -971,17 +878,20 @@ class Assignment(MultiAssignmentBase): def assignee_subscript_deps(self): return 
(_get_assignee_subscript_deps(self.assignee),) - def with_transformed_expressions(self, f, *args, **kwargs): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( - assignee=f(self.assignee, *args, **kwargs), - expression=f(self.expression, *args, **kwargs), + assignee=assignee_f(self.assignee), + expression=f(self.expression), predicates=frozenset( - f(pred, *args, **kwargs) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} def __str__(self): - result = "%s <- %s" % (self.assignee, self.expression) + result = f"{self.assignee} <- {self.expression}" if self.id is not None: result = "%s: " % self.id + result @@ -1013,7 +923,7 @@ class ExpressionInstruction(Assignment): warn("ExpressionInstruction is deprecated. Use Assignment instead", DeprecationWarning, stacklevel=2) - super(ExpressionInstruction, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) # }}} @@ -1044,7 +954,7 @@ class CallInstruction(MultiAssignmentBase): fields = MultiAssignmentBase.fields | \ set("assignees temp_var_types".split()) - pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignees"]) + pymbolic_fields = MultiAssignmentBase.pymbolic_fields | {"assignees"} def __init__(self, assignees, expression, @@ -1056,14 +966,11 @@ class CallInstruction(MultiAssignmentBase): no_sync_with=None, within_inames_is_final=None, within_inames=None, - boostable=None, boostable_into=None, tags=None, + tags=None, temp_var_types=None, - priority=0, predicates=frozenset(), - insn_deps=None, insn_deps_is_final=None, - forced_iname_deps=None, - forced_iname_deps_is_final=None): + priority=0, predicates=frozenset()): - super(CallInstruction, self).__init__( + super().__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -1072,15 +979,9 @@ class CallInstruction(MultiAssignmentBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, priority=priority, predicates=predicates, - tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final, - forced_iname_deps=forced_iname_deps, - forced_iname_deps_is_final=forced_iname_deps_is_final) + tags=tags) from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import Reduction @@ -1128,17 +1029,20 @@ class CallInstruction(MultiAssignmentBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args, **kwargs): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( - assignees=f(self.assignees, *args, **kwargs), - expression=f(self.expression, *args, **kwargs), + assignees=assignee_f(self.assignees), + expression=f(self.expression), predicates=frozenset( - f(pred, *args, **kwargs) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} def __str__(self): - result = "%s: %s <- %s" % (self.id, + result = "{}: {} <- {}".format(self.id, ", ".join(str(a) for a in self.assignees), self.expression) @@ -1159,7 +1063,7 @@ class CallInstruction(MultiAssignmentBase): from pymbolic.primitives import CallWithKwargs arg_id_to_val = dict(enumerate(self.expression.parameters)) if isinstance(self.expression, CallWithKwargs): - for kw, val in six.iteritems(self.expression.kw_parameters): + for kw, val in self.expression.kw_parameters.items(): arg_id_to_val[kw] = val for i, arg in 
enumerate(self.assignees): arg_id_to_val[-i-1] = arg @@ -1338,9 +1242,8 @@ class CInstruction(InstructionBase): groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, - priority=0, boostable=None, boostable_into=None, - predicates=frozenset(), tags=None, - insn_deps=None, insn_deps_is_final=None): + priority=0, + predicates=frozenset(), tags=None): """ :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples, simple strings pepresenting inames are also allowed. A single @@ -1359,11 +1262,7 @@ class CInstruction(InstructionBase): no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - boostable=boostable, - boostable_into=boostable_into, - priority=priority, predicates=predicates, tags=tags, - insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final) + priority=priority, predicates=predicates, tags=tags) # {{{ normalize iname_exprs @@ -1406,7 +1305,7 @@ class CInstruction(InstructionBase): def read_dependency_names(self): result = ( - super(CInstruction, self).read_dependency_names() + super().read_dependency_names() | frozenset(self.read_variables)) from loopy.symbolic import get_dependencies @@ -1429,22 +1328,25 @@ class CInstruction(InstructionBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): + if assignee_f is None: + assignee_f = f + return self.copy( iname_exprs=[ - (name, f(expr, *args)) + (name, f(expr)) for name, expr in self.iname_exprs], - assignees=[f(a, *args) for a in self.assignees], + assignees=[assignee_f(a) for a in self.assignees], predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred) for pred in self.predicates)) # }}} def __str__(self): - first_line = "%s: %s <- CODE(%s|%s)" % (self.id, + first_line = "{}: {} <- CODE({}|{})".format(self.id, ", ".join(str(a) for a in self.assignees), ", ".join(str(x) for x in self.read_variables), - ", ".join("%s=%s" % (name, expr) + ", ".join(f"{name}={expr}" for name, expr in self.iname_exprs)) options = self.get_str_options() @@ -1471,7 +1373,7 @@ class _DataObliviousInstruction(InstructionBase): def assignee_subscript_deps(self): return frozenset() - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, assignee_f=None): return self.copy( predicates=frozenset( f(pred) for pred in self.predicates)) @@ -1500,9 +1402,8 @@ class NoOpInstruction(_DataObliviousInstruction): no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, - boostable=None, boostable_into=None, predicates=None, tags=None): - super(NoOpInstruction, self).__init__( + super().__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -1512,8 +1413,6 @@ class NoOpInstruction(_DataObliviousInstruction): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags) @@ -1554,22 +1453,21 @@ class BarrierInstruction(_DataObliviousInstruction): ... 
lbarrier {mem_kind=global} """ - fields = _DataObliviousInstruction.fields | set(["synchronization_kind", - "mem_kind"]) + fields = _DataObliviousInstruction.fields | {"synchronization_kind", + "mem_kind"} def __init__(self, id, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, - boostable=None, boostable_into=None, predicates=None, tags=None, synchronization_kind="global", mem_kind="local"): if predicates: raise LoopyError("conditional barriers are not supported") - super(BarrierInstruction, self).__init__( + super().__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -1579,8 +1477,6 @@ class BarrierInstruction(_DataObliviousInstruction): within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, - boostable=boostable, - boostable_into=boostable_into, predicates=predicates, tags=tags ) @@ -1589,12 +1485,13 @@ class BarrierInstruction(_DataObliviousInstruction): self.mem_kind = mem_kind def __str__(self): - first_line = "%s: ... %sbarrier" % (self.id, self.synchronization_kind[0]) + first_line = \ + "{}: ... {}barrier".format(self.id, self.synchronization_kind[0]) options = self.get_str_options() if self.synchronization_kind == "local": # add the memory kind - options += ['mem_kind={}'.format(self.mem_kind)] + options += [f"mem_kind={self.mem_kind}"] if options: first_line += " {%s}" % (": ".join(options)) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index ead996445844e1cc3d09b5a7683b40201dcb6d34..84792cb4b1d137155378026ff21a3accb4680dc5 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1,7 +1,5 @@ -# coding=utf-8 """Operations on the kernel object.""" -from __future__ import division, absolute_import, print_function __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -27,8 +25,7 @@ THE SOFTWARE. 
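As an aside to the `with_transformed_expressions` hunks above: the new `assignee_f` keyword lets a caller rewrite right-hand sides (and predicates) while leaving assignees untouched. A minimal, illustrative sketch, not part of the patch; the instruction and the doubling transform are made up, and it assumes `Assignment`'s existing behavior of parsing string arguments into pymbolic expressions:

    from loopy.kernel.instruction import Assignment

    # string assignee/expression are parsed into pymbolic expressions
    insn = Assignment("tmp", "2*a + b")

    # double the RHS; the identity assignee_f keeps the LHS as-is
    insn2 = insn.with_transformed_expressions(
            lambda expr: 2*expr, assignee_f=lambda expr: expr)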
import sys -import six -from six.moves import intern +from sys import intern import numpy as np import islpy as isl @@ -60,7 +57,6 @@ def add_dtypes(program, dtype_dict): if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) - root_kernel root_kernel_with_added_dtypes = ( root_kernel.copy(args=new_args, temporary_variables=new_temp_vars)) @@ -68,21 +64,21 @@ def add_dtypes(program, dtype_dict): return program.with_root_kernel(root_kernel_with_added_dtypes) -def _add_dtypes_overdetermined(knl, dtype_dict): - dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(knl, dtype_dict) +def _add_dtypes_overdetermined(kernel, dtype_dict): + dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(kernel, dtype_dict) # do not throw error for unused args - return knl.copy(args=new_args, temporary_variables=new_temp_vars) + return kernel.copy(args=new_args, temporary_variables=new_temp_vars) -def _add_dtypes(knl, dtype_dict): +def _add_dtypes(kernel, dtype_dict): dtype_dict = dtype_dict.copy() new_args = [] from loopy.types import to_loopy_type - for arg in knl.args: + for arg in kernel.args: new_dtype = dtype_dict.pop(arg.name, None) if new_dtype is not None: - new_dtype = to_loopy_type(new_dtype, target=knl.target) + new_dtype = to_loopy_type(new_dtype, target=kernel.target) if arg.dtype is not None and arg.dtype != new_dtype: raise RuntimeError( "argument '%s' already has a different dtype " @@ -92,10 +88,10 @@ def _add_dtypes(knl, dtype_dict): new_args.append(arg) - new_temp_vars = knl.temporary_variables.copy() + new_temp_vars = kernel.temporary_variables.copy() import loopy as lp - for tv_name in knl.temporary_variables: + for tv_name in kernel.temporary_variables: new_dtype = dtype_dict.pop(tv_name, None) if new_dtype is not None: new_dtype = np.dtype(new_dtype) @@ -112,8 +108,8 @@ def _add_dtypes(knl, dtype_dict): return dtype_dict, new_args, new_temp_vars -def get_arguments_with_incomplete_dtype(knl): - return [arg.name for arg in knl.args +def get_arguments_with_incomplete_dtype(kernel): + return [arg.name for arg in kernel.args if arg.dtype is None] @@ -121,7 +117,7 @@ def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): assert isinstance(prog, Program) processed_dtype_dict = {} - for k, v in six.iteritems(dtype_dict): + for k, v in dtype_dict.items(): for subkey in k.split(","): subkey = subkey.strip() if subkey: @@ -133,11 +129,11 @@ def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): return infer_unknown_types(prog, expect_completion=expect_completion) -def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): - knl = _add_dtypes_overdetermined(knl, dtype_dict) +def _add_and_infer_dtypes_overdetermined(kernel, dtype_dict): + kernel = _add_dtypes_overdetermined(kernel, dtype_dict) from loopy.type_inference import infer_unknown_types - return infer_unknown_types(knl, expect_completion=True) + return infer_unknown_types(kernel, expect_completion=True) # }}} @@ -300,7 +296,7 @@ def find_all_insn_inames(kernel): logger.debug("%s: find_all_insn_inames: done" % kernel.name) - for v in six.itervalues(insn_id_to_inames): + for v in insn_id_to_inames.values(): assert isinstance(v, frozenset) return insn_id_to_inames @@ -310,53 +306,65 @@ def find_all_insn_inames(kernel): # {{{ set operation cache +def _eliminate_except(set_, except_inames, dts): + return set_.eliminate_except(except_inames, dts) + + +def _get_dim_max(set_, idx): + return set_.dim_max(idx) + + +def _get_dim_min(set_, idx): + return 
set_.dim_min(idx) + + class SetOperationCacheManager: def __init__(self): - # mapping: set hash -> [(set, op, args, result)] + # mapping: set hash -> [(set, result)] self.cache = {} - def op(self, set, op_name, op, args): - hashval = hash(set) + def op(self, set_, op, args): + hashval = hash((set_, op, args)) bucket = self.cache.setdefault(hashval, []) - for bkt_set, bkt_op, bkt_args, result in bucket: - if set.plain_is_equal(bkt_set) and op == bkt_op and args == bkt_args: + for bkt_set, result in bucket: + if set_.plain_is_equal(bkt_set): return result - #print op, set.get_dim_name(dim_type.set, args[0]) - result = op(set, *args) - bucket.append((set, op_name, args, result)) + result = op(set_, *args) + bucket.append((set_, result)) return result - def dim_min(self, set, *args): - if set.plain_is_empty(): - raise LoopyError("domain '%s' is empty" % set) + def dim_min(self, set_, *args): + if set_.plain_is_empty(): + raise LoopyError("domain '%s' is empty" % set_) + + return self.op(set_, _get_dim_min, args) - from loopy.isl_helpers import dim_min_with_elimination - return self.op(set, "dim_min", dim_min_with_elimination, args) + def dim_max(self, set_, *args): + if set_.plain_is_empty(): + raise LoopyError("domain '%s' is empty" % set_) - def dim_max(self, set, *args): - if set.plain_is_empty(): - raise LoopyError("domain '%s' is empty" % set) + return self.op(set_, _get_dim_max, args) - from loopy.isl_helpers import dim_max_with_elimination - return self.op(set, "dim_max", dim_max_with_elimination, args) + def eliminate_except(self, set_, *args): + return self.op(set_, _eliminate_except, args) - def base_index_and_length(self, set, iname, context=None, + def base_index_and_length(self, set_, iname, context=None, n_allowed_params_in_length=None): """ :arg n_allowed_params_in_length: Simplifies the 'length' argument so that only the first that many params - (in the domain of *set*) occur. + (in the domain of *set_*) occur. """ if not isinstance(iname, int): - iname_to_dim = set.space.get_var_dict() + iname_to_dim = set_.space.get_var_dict() idx = iname_to_dim[iname][1] else: idx = iname - lower_bound_pw_aff = self.dim_min(set, idx) - upper_bound_pw_aff = self.dim_max(set, idx) + lower_bound_pw_aff = self.dim_min(set_, idx) + upper_bound_pw_aff = self.dim_max(set_, idx) from loopy.diagnostic import StaticValueFindingError from loopy.isl_helpers import ( @@ -469,7 +477,7 @@ class DomainChanger: @iterate_over_kernels_if_given_program def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, use_insn_id=False): - """Return a string in the `dot `_ language depicting + """Return a string in the `dot `_ language depicting dependencies among kernel instructions. """ @@ -495,8 +503,8 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): - lhs = ', '.join(str(assignee) for assignee in insn.assignees) - op = "%s <- %s" % (lhs, insn.expression) + lhs = ", ".join(str(assignee) for assignee in insn.assignees) + op = f"{lhs} <- {insn.expression}" if len(op) > 200: op = op[:200] + "..." 
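The `get_dot_dependency_graph` changes above are cosmetic, but for context, a hypothetical usage sketch under this branch's transitional signature; `knl` and `callables_table` are assumed to already exist, e.g. from a preprocessed program:

    from loopy.kernel.tools import get_dot_dependency_graph

    # iname_cluster=True walks kernel.schedule, so pass False for a
    # kernel that has not been scheduled/linearized yet
    dot_source = get_dot_dependency_graph(
            knl, callables_table, iname_cluster=False)

    with open("insn-deps.dot", "w") as outf:
        outf.write(dot_source)
    # render with Graphviz, e.g.: dot -Tsvg insn-deps.dot -o insn-deps.svg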
@@ -512,7 +520,7 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, insn_label = op tooltip = insn.id - lines.append("\"%s\" [label=\"%s\",shape=\"box\",tooltip=\"%s\"];" + lines.append('"%s" [label="%s",shape="box",tooltip="%s"];' % ( insn.id, repr(insn_label)[1:-1], @@ -547,7 +555,7 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, for insn_1 in dep_graph: for insn_2 in dep_graph.get(insn_1, set()): - lines.append("%s -> %s" % (insn_2, insn_1)) + lines.append(f"{insn_2} -> {insn_1}") if iname_cluster: from loopy.schedule import ( @@ -556,7 +564,7 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, for sched_item in kernel.schedule: if isinstance(sched_item, EnterLoop): - lines.append("subgraph cluster_%s { label=\"%s\"" + lines.append('subgraph cluster_%s { label="%s"' % (sched_item.iname, sched_item.iname)) elif isinstance(sched_item, LeaveLoop): lines.append("}") @@ -567,7 +575,7 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, else: raise LoopyError("schedule item not understood: %r" % sched_item) - return "digraph %s {\n%s\n}" % ( + return "digraph {} {{\n{}\n}}".format( kernel.name, "\n".join(lines) ) @@ -689,9 +697,9 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # {{{ figure out automatic-axis inames from loopy.kernel.data import AutoLocalIndexTagBase - auto_axis_inames = set( + auto_axis_inames = { iname for iname in kernel.insn_inames(insn) - if kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase)) + if kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase)} # }}} @@ -730,7 +738,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): if stride is None: continue coeffs = CoefficientCollector()(iexpr_i) - for var, coeff in six.iteritems(coeffs): + for var, coeff in coeffs.items(): if (isinstance(var, Variable) and var.name in auto_axis_inames): # excludes '1', i.e. the constant @@ -742,7 +750,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} from pymbolic import evaluate - for iname, stride_expr in six.iteritems(iname_to_stride_expr): + for iname, stride_expr in iname_to_stride_expr.items(): stride = evaluate(stride_expr, approximate_arg_values) aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride @@ -954,7 +962,7 @@ def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): # {{{ array modifier -class ArrayChanger(object): +class ArrayChanger: def __init__(self, kernel, array_name): self.kernel = kernel self.array_name = array_name @@ -1036,8 +1044,8 @@ def guess_var_shape(kernel, var_name): % (var_name, ", ".join( str(i) for i in armap.bad_subscripts))) - n_axes_in_subscripts = set( - len(sub.index_tuple) for sub in armap.bad_subscripts) + n_axes_in_subscripts = { + len(sub.index_tuple) for sub in armap.bad_subscripts} if len(n_axes_in_subscripts) != 1: raise RuntimeError("subscripts of '%s' with differing " @@ -1088,7 +1096,7 @@ def guess_var_shape(kernel, var_name): # {{{ loop nest tracker -class SetTrie(object): +class SetTrie: """ Similar to a trie, but uses an unordered sequence as the key. 
""" @@ -1103,9 +1111,8 @@ class SetTrie(object): def descend(self, on_found=lambda prefix: None, prefix=frozenset()): on_found(prefix) - from six import iteritems for prefix, child in sorted( - iteritems(self.children), + self.children.items(), key=lambda it: sorted(it[0])): child.descend(on_found, prefix=prefix) @@ -1117,9 +1124,7 @@ class SetTrie(object): if len(key) == 0: return - from six import iteritems - - for child_key, child in iteritems(self.children): + for child_key, child in self.children.items(): common = child_key & key if common: break @@ -1178,16 +1183,16 @@ def get_visual_iname_order_embedding(kernel): iname_trie = SetTrie() for insn in kernel.instructions: - within_inames = set( + within_inames = { iname for iname in insn.within_inames - if iname not in ilp_inames) + if iname not in ilp_inames} iname_trie.add_or_update(within_inames) embedding = {} def update_embedding(inames): embedding.update( - dict((iname, (len(embedding), iname)) for iname in inames)) + {iname: (len(embedding), iname) for iname in inames}) iname_trie.descend(update_embedding) @@ -1288,8 +1293,8 @@ def draw_dependencies_as_unicode_arrows( def make_extender(): result = n_columns[0] * [" "] - for col, (_, pointed_at_insn_id) in six.iteritems(columns_in_use): - result[col] = do_flag_downward(u"│", pointed_at_insn_id) + for col, (_, pointed_at_insn_id) in columns_in_use.items(): + result[col] = do_flag_downward("│", pointed_at_insn_id) return result @@ -1321,28 +1326,28 @@ def draw_dependencies_as_unicode_arrows( # }}} - for col, (starts, pointed_at_insn_id) in list(six.iteritems(columns_in_use)): + for col, (starts, pointed_at_insn_id) in list(columns_in_use.items()): if insn.id == pointed_at_insn_id: if starts: # will continue downward - row[col] = do_flag_downward(u">", pointed_at_insn_id) + row[col] = do_flag_downward(">", pointed_at_insn_id) else: # stops here # placeholder, pending deletion columns_in_use[col] = None - row[col] = do_flag_downward(u"↳", pointed_at_insn_id) + row[col] = do_flag_downward("↳", pointed_at_insn_id) elif insn.id in starts: starts.remove(insn.id) if starts or pointed_at_insn_id not in processed_ids: # will continue downward - row[col] = do_flag_downward(u"├", pointed_at_insn_id) + row[col] = do_flag_downward("├", pointed_at_insn_id) else: # stops here - row[col] = u"└" + row[col] = "└" # placeholder, pending deletion columns_in_use[col] = None @@ -1352,7 +1357,7 @@ def draw_dependencies_as_unicode_arrows( if dep_key not in dep_to_column and rdeps: col = dep_to_column[dep_key] = find_free_column() columns_in_use[col] = (rdeps, insn.id) - row[col] = u"↱" + row[col] = "↱" # }}} @@ -1368,13 +1373,13 @@ def draw_dependencies_as_unicode_arrows( # we're currently handling it. columns_in_use[col] = (set(), dep) - row[col] = do_flag_downward(u"┌", dep) + row[col] = do_flag_downward("┌", dep) # }}} # {{{ delete columns_in_use entry for end-of-life columns - for col, value in list(six.iteritems(columns_in_use)): + for col, value in list(columns_in_use.items()): if value is None: del columns_in_use[col] @@ -1398,7 +1403,7 @@ def draw_dependencies_as_unicode_arrows( .replace(style.RESET_ALL, "")) return len(s) - def truncate_without_color_escapes(s, l): + def truncate_without_color_escapes(s, length): # FIXME: This is a bit dumb--it removes color escapes when truncation # is needed. 
@@ -1406,7 +1411,7 @@ def draw_dependencies_as_unicode_arrows( .replace(fore.RED, "") .replace(style.RESET_ALL, "")) - return s[:l] + u"…" + return s[:length] + "…" def conform_to_uniform_length(s): len_s = len_without_color_escapes(s) @@ -1445,6 +1450,8 @@ def stringify_instruction_list(kernel): def insert_insn_into_order(insn): if insn.id in printed_insn_ids: + # Note: dependency cycles are deliberately ignored so that printing + # succeeds. return printed_insn_ids.add(insn.id) @@ -1523,12 +1530,12 @@ def stringify_instruction_list(kernel): trailing = [] elif isinstance(insn, lp.CInstruction): lhs = ", ".join(str(a) for a in insn.assignees) - rhs = "CODE(%s|%s)" % ( + rhs = "CODE({}|{})".format( ", ".join(str(x) for x in insn.read_variables), - ", ".join("%s=%s" % (name, expr) + ", ".join(f"{name}={expr}" for name, expr in insn.iname_exprs)) - trailing = [l for l in insn.code.split("\n")] + trailing = insn.code.split("\n") elif isinstance(insn, lp.BarrierInstruction): lhs = "" rhs = "... %sbarrier" % insn.synchronization_kind[0] @@ -1562,11 +1569,11 @@ def stringify_instruction_list(kernel): options.append("no_sync_with=%s" % ":".join( "%s@%s" % entry for entry in sorted(insn.no_sync_with))) if isinstance(insn, lp.BarrierInstruction) and \ - insn.synchronization_kind == 'local': - options.append('mem_kind=%s' % insn.mem_kind) + insn.synchronization_kind == "local": + options.append("mem_kind=%s" % insn.mem_kind) if lhs: - core = "%s = %s" % ( + core = "{} = {}".format( Fore.CYAN+lhs+Style.RESET_ALL, Fore.MAGENTA+rhs+Style.RESET_ALL, ) @@ -1600,6 +1607,13 @@ def stringify_instruction_list(kernel): # {{{ global barrier order finding +def _is_global_barrier(kernel, insn_id): + insn = kernel.id_to_insn[insn_id] + from loopy.kernel.instruction import BarrierInstruction + return isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global" + + @memoize_on_first_arg def get_global_barrier_order(kernel): """Return a :class:`tuple` of the listing the ids of global barrier instructions @@ -1607,49 +1621,27 @@ def get_global_barrier_order(kernel): See also :class:`loopy.instruction.BarrierInstruction`. """ - barriers = [] - visiting = set() - visited = set() - - unvisited = set(insn.id for insn in kernel.instructions) - - def is_barrier(my_insn_id): - insn = kernel.id_to_insn[my_insn_id] - from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and \ - insn.synchronization_kind == "global" - - while unvisited: - stack = [unvisited.pop()] - - while stack: - top = stack[-1] - - if top in visiting: - visiting.remove(top) - if is_barrier(top): - barriers.append(top) + dep_graph = {insn.id: set() for insn in kernel.instructions} + for insn in kernel.instructions: + for dep in insn.depends_on: + dep_graph[dep].add(insn.id) - if top in visited: - stack.pop() - continue + from pytools.graph import compute_topological_order + order = compute_topological_order(dep_graph) - visited.add(top) - visiting.add(top) + barriers = [ + insn_id for insn_id in order + if _is_global_barrier(kernel, insn_id)] - for child in kernel.id_to_insn[top].depends_on: - # Check for no cycles. - assert child not in visiting - stack.append(child) + del order # Ensure this is the only possible order. # # We do this by looking at the barriers in order. # We check for each adjacent pair (a,b) in the order if a < b, # i.e. if a is reachable by a chain of dependencies from b. 
- - visiting.clear() - visited.clear() + visited = set() + visiting = set() for prev_barrier, barrier in zip(barriers, barriers[1:]): # Check if prev_barrier is reachable from barrier. @@ -1707,22 +1699,16 @@ def find_most_recent_global_barrier(kernel, insn_id): if len(insn.depends_on) == 0: return None - def is_barrier(my_insn_id): - insn = kernel.id_to_insn[my_insn_id] - from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and \ - insn.synchronization_kind == "global" - - global_barrier_to_ordinal = dict( - (b, i) for i, b in enumerate(global_barrier_order)) + global_barrier_to_ordinal = { + b: i for i, b in enumerate(global_barrier_order)} def get_barrier_ordinal(barrier_id): return (global_barrier_to_ordinal[barrier_id] if barrier_id is not None else -1) - direct_barrier_dependencies = set( - dep for dep in insn.depends_on if is_barrier(dep)) + direct_barrier_dependencies = { + dep for dep in insn.depends_on if _is_global_barrier(kernel, dep)} if len(direct_barrier_dependencies) > 0: return max(direct_barrier_dependencies, key=get_barrier_ordinal) @@ -1744,8 +1730,8 @@ def get_subkernels(kernel): See also :class:`loopy.schedule.CallKernel`. """ from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: - raise LoopyError("Kernel must be scheduled") + if kernel.state != KernelState.LINEARIZED: + raise LoopyError("Kernel must be linearized") from loopy.schedule import CallKernel @@ -1761,7 +1747,7 @@ def get_subkernel_to_insn_id_map(kernel): - kernel must be scheduled. + kernel must be linearized. """ from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: - raise LoopyError("Kernel must be scheduled") + raise LoopyError("Kernel must be linearized") from loopy.schedule import ( @@ -1792,7 +1778,7 @@ def get_subkernel_to_insn_id_map(kernel): # {{{ find aliasing equivalence classes -class DisjointSets(object): +class DisjointSets: """ .. automethod:: __getitem__ .. 
automethod:: find_leader_or_create_group @@ -1814,7 +1800,7 @@ class DisjointSets(object): try: leader = self.element_to_leader[item] except KeyError: - return set([item]) + return {item} else: return self.leader_to_group[leader] @@ -1825,7 +1811,7 @@ class DisjointSets(object): pass self.element_to_leader[el] = el - self.leader_to_group[el] = set([el]) + self.leader_to_group[el] = {el} return el def union(self, a, b): @@ -1864,7 +1850,7 @@ class DisjointSets(object): def find_aliasing_equivalence_classes(kernel): return DisjointSets().union_many( (tv.base_storage, tv.name) - for tv in six.itervalues(kernel.temporary_variables) + for tv in kernel.temporary_variables.values() if tv.base_storage is not None) # }}} @@ -1991,8 +1977,8 @@ class CallCollector(CombineMapper): def map_call_with_kwargs(self, expr): return (frozenset([expr.function.name]) | - self.combine((self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values()))) def map_constant(self, expr): return frozenset() diff --git a/loopy/library/function.py b/loopy/library/function.py index 378b7de5897912e2e04314b066f40e5ea6b0c785..291f0c372bdac74a79f25da361bb381c5646ed58 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -38,8 +36,8 @@ class MakeTupleCallable(ScalarCallable): def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): from loopy.kernel.function_interface import ValueArgDescriptor - new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), - (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) + new_arg_id_to_descr = {(id, ValueArgDescriptor()): + (-id-1, ValueArgDescriptor()) for id in arg_id_to_descr.keys()} return ( self.copy(arg_id_to_descr=new_arg_id_to_descr), @@ -48,8 +46,8 @@ class MakeTupleCallable(ScalarCallable): class IndexOfCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel, callables_table): - new_arg_id_to_dtype = dict((i, dtype) for i, dtype in - arg_id_to_dtype.items() if dtype is not None) + new_arg_id_to_dtype = {i: dtype for i, dtype in + arg_id_to_dtype.items() if dtype is not None} new_arg_id_to_dtype[-1] = kernel.index_dtype return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), diff --git a/loopy/library/random123.py b/loopy/library/random123.py index e59a892bb4c7b3bd7222bf61b29e0ade92195240..6ec8affe35982c1412112fd07f93458cb6a63cde 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -1,6 +1,5 @@ """Library integration with Random123.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" @@ -63,12 +62,12 @@ RNG_VARIANTS = [ _threefry_base_info.copy(width=4, bits=64), ] -FUNC_NAMES_TO_RNG = dict( - (v.full_name + suffix, v) +FUNC_NAMES_TO_RNG = { + v.full_name + suffix: v for v in RNG_VARIANTS for suffix in [ "", "_f32", "_f64", - ]) + ]} # }}} diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 5b78c08f4d3588123a6eaf1d6dccda239ef6fed7..f44d243230fb31264a7e2a588e6086b2173daa2a 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -33,8 +31,24 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from 
loopy.tools import update_persistent_hash +__doc__ = """ +.. currentmodule:: loopy.library.reduction + +.. autoclass:: ReductionOperation + +.. autoclass:: ScalarReductionOperation + +.. autoclass:: SumReductionOperation + +.. autoclass:: ProductReductionOperation + +.. autoclass:: MaxReductionOperation + +.. autoclass:: MinReductionOperation +""" + -class ReductionOperation(object): +class ReductionOperation: """Subclasses of this type have to be hashable, picklable, and equality-comparable. """ @@ -122,7 +136,7 @@ class ScalarReductionOperation(ReductionOperation): result = type(self).__name__.replace("ReductionOperation", "").lower() if self.forced_result_type is not None: - result = "%s<%s>" % (result, str(self.forced_result_type)) + result = "{}<{}>".format(result, str(self.forced_result_type)) return result @@ -154,11 +168,11 @@ def get_le_neutral(dtype): elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: - #32 bit integer + # 32 bit integer return var("INT_MAX") elif dtype.numpy_dtype.itemsize == 8: - #64 bit integer - return var('LONG_MAX') + # 64 bit integer + return var("LONG_MAX") else: raise NotImplementedError("less") @@ -172,11 +186,11 @@ def get_ge_neutral(dtype): elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: - #32 bit integer + # 32 bit integer return var("INT_MIN") elif dtype.numpy_dtype.itemsize == 8: - #64 bit integer - return var('LONG_MIN') + # 64 bit integer + return var("LONG_MIN") else: raise NotImplementedError("less") @@ -255,7 +269,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return 2 def prefix(self, scalar_dtype, segment_flag_dtype): - return "loopy_segmented_%s_%s_%s" % (self.which, + return "loopy_segmented_{}_{}_{}".format(self.which, scalar_dtype.numpy_dtype.type.__name__, segment_flag_dtype.numpy_dtype.type.__name__) @@ -328,7 +342,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): raise NotImplementedError def prefix(self, scalar_dtype, index_dtype): - return "loopy_arg%s_%s_%s" % (self.which, + return "loopy_arg{}_{}_{}".format(self.which, scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__) @@ -406,7 +420,7 @@ _REDUCTION_OP_PARSERS = [ def register_reduction_parser(parser): - """Register a new :class:`ReductionOperation`. + """Register a new :class:`loopy.library.reduction.ReductionOperation`. :arg parser: A function that receives a string and returns a subclass of ReductionOperation. 
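A sketch of how the `register_reduction_parser` hook documented above might be used; `MyOpReductionOperation` is hypothetical, and the parser is assumed (like the built-in parsers) to return None for names it does not recognize:

    from loopy.library.reduction import register_reduction_parser

    def parse_myop(name):
        if name == "myop":
            return MyOpReductionOperation()  # hypothetical subclass
        return None

    register_reduction_parser(parse_myop)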
@@ -472,28 +486,28 @@ class ReductionCallable(ScalarCallable): prefix = op.prefix(scalar_dtype, index_dtype) yield (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {index_t} index1, + {scalar_t} op2, {index_t} index2, + {index_t} *index_out) + {{ + if (op2 {comp} op1) + {{ *index_out = index2; return op2; - } + }} else - { + {{ *index_out = index1; return op1; - } - } - """ % dict( - scalar_t=target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) + }} + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) elif isinstance(self.name, SegmentedOp): op = self.name.reduction_op scalar_dtype = self.arg_id_to_dtype[-1] @@ -501,20 +515,20 @@ class ReductionCallable(ScalarCallable): prefix = op.prefix(scalar_dtype, segment_flag_dtype) yield (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {segment_flag_t} segment_flag1, + {scalar_t} op2, {segment_flag_t} segment_flag2, + {segment_flag_t} *segment_flag_out) + {{ *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : %(combined)s; - } - """ % dict( - scalar_t=target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + return segment_flag2 ? op2 : {combined}; + }} + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) return diff --git a/loopy/loop.py b/loopy/loop.py index 24cbe730f7679ba9b9931f7493d3c793ce3718c9..73ca8d72824071b36bf91798ba9a1ea14e624db7 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -24,7 +22,6 @@ THE SOFTWARE. import islpy as isl -import six from loopy.program import iterate_over_kernels_if_given_program @@ -71,7 +68,7 @@ def merge_loop_domains(kernel): new_domains = None - for inner_iname, outer_inames in six.iteritems(lnm): + for inner_iname, outer_inames in lnm.items(): for outer_iname in outer_inames: # {{{ check if it's safe to merge diff --git a/loopy/match.py b/loopy/match.py index 9766fac2b57f5cb55eebb09f4ab32880ef3c2038..f13d56053c7e87333192cc3980a26fc2c18f7a51 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -1,7 +1,6 @@ """Matching functionality for instruction ids and substitution rule invocations stacks.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,7 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from six.moves import range, intern +from sys import intern NoneType = type(None) @@ -116,7 +115,7 @@ _PREC_NOT = 30 # {{{ match expression -class MatchExpressionBase(object): +class MatchExpressionBase: def __call__(self, kernel, matchable): raise NotImplementedError @@ -162,7 +161,7 @@ class MultiChildMatchExpressionBase(MatchExpressionBase): return "(%s)" % (joiner.join(str(ch) for ch in self.children)) def __repr__(self): - return "%s(%s)" % ( + return "{}({})".format( type(self).__name__, ", ".join(repr(ch) for ch in self.children)) @@ -199,7 +198,7 @@ class Not(MatchExpressionBase): return "(not %s)" % str(self.child) def __repr__(self): - return "%s(%r)" % (type(self).__name__, self.child) + return "{}({!r})".format(type(self).__name__, self.child) def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, "not_match_expr") @@ -226,7 +225,7 @@ class GlobMatchExpressionBase(MatchExpressionBase): return descr.lower() + ":" + self.glob def __repr__(self): - return "%s(%r)" % (type(self).__name__, self. glob) + return "{}({!r})".format(type(self).__name__, self. glob) def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, type(self).__name__) @@ -386,7 +385,7 @@ def parse_match(expr): # {{{ stack match objects -class StackMatchComponent(object): +class StackMatchComponent: def __ne__(self, other): return not self.__eq__(other) @@ -455,7 +454,7 @@ class StackWildcardMatchComponent(StackMatchComponent): # {{{ stack matcher -class RuleInvocationMatchable(object): +class RuleInvocationMatchable: def __init__(self, id, tags): self.id = id self.tags = tags @@ -470,7 +469,7 @@ class RuleInvocationMatchable(object): raise TypeError("inames: query may not be applied to rule invocations") -class StackMatch(object): +class StackMatch: def __init__(self, root_component): self.root_component = root_component diff --git a/loopy/maxima.py b/loopy/maxima.py deleted file mode 100644 index c74360a731fa06644065e743fb9397ea170fb7f3..0000000000000000000000000000000000000000 --- a/loopy/maxima.py +++ /dev/null @@ -1,105 +0,0 @@ -# pylint: disable=all # This code needs porting to modern loopy -"""Export to maxima.""" - -from __future__ import division - -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-""" - - -from pymbolic.interop.maxima import \ - MaximaStringifyMapper as MaximaStringifyMapperBase - - -class MaximaStringifyMapper(MaximaStringifyMapperBase): - def map_subscript(self, expr, enclosing_prec): - res = self.rec(expr.aggregate, enclosing_prec) - idx = expr.index - if not isinstance(idx, tuple): - idx = (idx,) - for i in idx: - if isinstance(i, int): - res += "_%d" % i - - return res - - -def get_loopy_instructions_as_maxima(kernel, prefix): - """Sample use for code comparison:: - - load("knl-optFalse.mac"); - load("knl-optTrue.mac"); - - vname: bessel_j_8; - - un_name : concat(''un_, vname); - opt_name : concat(''opt_, vname); - - print(ratsimp(ev(un_name - opt_name))); - """ - from loopy.preprocess import add_boostability_and_automatic_dependencies - kernel = add_boostability_and_automatic_dependencies(kernel) - - my_variable_names = ( - avn - for insn in kernel.instructions - for avn in insn.assignee_var_names() - ) - - from pymbolic import var - subst_dict = dict( - (vn, var(prefix+vn)) for vn in my_variable_names) - - mstr = MaximaStringifyMapper() - from loopy.symbolic import SubstitutionMapper - from pymbolic.mapper.substitutor import make_subst_func - substitute = SubstitutionMapper(make_subst_func(subst_dict)) - - result = ["ratprint:false;"] - - written_insn_ids = set() - - from loopy.kernel import InstructionBase, Assignment - - def write_insn(insn): - if not isinstance(insn, InstructionBase): - insn = kernel.id_to_insn[insn] - if not isinstance(insn, Assignment): - raise RuntimeError("non-single-output assignment not supported " - "in maxima export") - - for dep in insn.depends_on: - if dep not in written_insn_ids: - write_insn(dep) - - aname, = insn.assignee_var_names() - result.append("%s%s : %s;" % ( - prefix, aname, - mstr(substitute(insn.expression)))) - - written_insn_ids.add(insn.id) - - for insn in kernel.instructions: - if insn.id not in written_insn_ids: - write_insn(insn) - - return "\n".join(result) diff --git a/loopy/options.py b/loopy/options.py index 63089d94d3487e77a1def39a98fe24631c508398..2dc8f22cd8a205da89d86b5157af8792a37111ed 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement - __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six from pytools import ImmutableRecord import re @@ -31,7 +28,7 @@ import re ALLOW_TERMINAL_COLORS = True -class _ColoramaStub(object): +class _ColoramaStub: def __getattribute__(self, name): return "" @@ -39,7 +36,7 @@ class _ColoramaStub(object): def _apply_legacy_map(lmap, kwargs): result = {} - for name, val in six.iteritems(kwargs): + for name, val in kwargs.items(): try: lmap_value = lmap[name] except KeyError: @@ -89,12 +86,6 @@ class Options(ImmutableRecord): Like :attr:`trace_assignments`, but also trace the assigned values. - .. attribute:: ignore_boostable_into - - Ignore the boostable_into field of the kernel, when - determining whether an iname duplication is necessary - for the kernel to be schedulable. - .. attribute:: check_dep_resolution Whether loopy should issue an error if a dependency @@ -117,7 +108,7 @@ class Options(ImmutableRecord): .. attribute:: cl_exec_manage_array_events Within the PyOpenCL executor, respect and udpate - :attr:`pyopencl.array.Array.event`. + :attr:`pyopencl.array.Array.events`. Defaults to *True*. @@ -146,7 +137,7 @@ class Options(ImmutableRecord): .. 
attribute:: edit_code Invoke an editor (given by the environment variable - :envvar:`EDITOR`) on the generated kernel code, + ``EDITOR``) on the generated kernel code, allowing for tweaks before the code is passed on to the target for compilation. @@ -211,7 +202,6 @@ class Options(ImmutableRecord): annotate_inames=kwargs.get("annotate_inames", False), trace_assignments=kwargs.get("trace_assignments", False), trace_assignment_values=kwargs.get("trace_assignment_values", False), - ignore_boostable_into=kwargs.get("ignore_boostable_into", False), skip_arg_checks=kwargs.get("skip_arg_checks", False), no_numpy=kwargs.get("no_numpy", False), @@ -228,7 +218,7 @@ class Options(ImmutableRecord): check_dep_resolution=kwargs.get("check_dep_resolution", True), enforce_variable_access_ordered=kwargs.get( - "enforce_variable_access_ordered", False), + "enforce_variable_access_ordered", True), ) # {{{ legacy compatibility diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 504b361fb001f6683d6ee8837d7af1c3b51d83ef..0d55d5c92bd9c43219e2d57c3a20ac8248856dfb 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -25,7 +23,6 @@ THE SOFTWARE. import logging logger = logging.getLogger(__name__) -import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) @@ -39,6 +36,7 @@ from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types from loopy.symbolic import RuleAwareIdentityMapper +from loopy.transform.iname import remove_any_newly_unused_inames from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -69,7 +67,7 @@ def prepare_for_caching(kernel): new_args.append(arg) new_temporary_variables = {} - for name, temp in six.iteritems(kernel.temporary_variables): + for name, temp in kernel.temporary_variables.items(): dtype = temp.dtype if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: temp = temp.copy(dtype=dtype.with_target(tgt), target=tgt) @@ -127,7 +125,7 @@ def check_reduction_iname_uniqueness(kernel): for insn in kernel.instructions: insn.with_transformed_expressions(cb_mapper) - for iname, count in six.iteritems(iname_to_reduction_count): + for iname, count in iname_to_reduction_count.items(): nonsimul_count = iname_to_nonsimultaneous_reduction_count.get(iname, 0) if nonsimul_count and count > 1: @@ -146,18 +144,18 @@ def check_reduction_iname_uniqueness(kernel): # {{{ decide temporary address space def _get_compute_inames_tagged(kernel, insn, tag_base): - return set(iname for iname in kernel.insn_inames(insn.id) - if kernel.iname_tags_of_type(iname, tag_base)) + return {iname for iname in kernel.insn_inames(insn.id) + if kernel.iname_tags_of_type(iname, tag_base)} def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names): - return set(iname + return {iname for aname, adeps in zip( insn.assignee_var_names(), insn.assignee_subscript_deps()) for iname in adeps & kernel.all_inames() if aname in tv_names - if kernel.iname_tags_of_type(iname, tag_base)) + if kernel.iname_tags_of_type(iname, tag_base)} def find_temporary_address_space(kernel): @@ -174,7 +172,7 @@ def find_temporary_address_space(kernel): kernel_var_names = kernel.all_variable_names(include_temp_storage=False) - for temp_var in 
six.itervalues(kernel.temporary_variables): + for temp_var in kernel.temporary_variables.values(): if temp_var.base_storage is not None: # no nesting allowed if temp_var.base_storage in kernel_var_names: @@ -185,7 +183,7 @@ def find_temporary_address_space(kernel): base_storage_to_aliases.setdefault( temp_var.base_storage, []).append(temp_var.name) - for temp_var in six.itervalues(kernel.temporary_variables): + for temp_var in kernel.temporary_variables.values(): # Only fill out for variables that do not yet know if they're # local. (I.e. those generated by implicit temporary generation.) @@ -237,7 +235,7 @@ def find_temporary_address_space(kernel): if (apin != cpin and bool(apin)): warn_with_kernel( kernel, - "write_race_%s(%s)" % (aspace_descr, insn_id), + f"write_race_{aspace_descr}({insn_id})", "instruction '%s' looks invalid: " "it assigns to indices based on %s IDs, but " "its temporary '%s' cannot be made %s because " @@ -452,7 +450,7 @@ def _try_infer_scan_candidate_from_expr( if len(expr.inames) != 1: raise ValueError( - "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),)) + "Multiple inames in reduction: '{}'".format(", ".join(expr.inames))) scan_iname, = expr.inames @@ -501,9 +499,9 @@ def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): sweep_iname_candidate = None for constr in constrs: - candidate_vars = set([ + candidate_vars = { var for var in constr.get_var_dict() - if var in candidate_inames]) + if var in candidate_inames} # Irrelevant constraint - skip if scan_iname not in candidate_vars: @@ -720,13 +718,12 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): new_or_updated_instructions = {} new_temporaries = {} - dep_map = dict( - (insn.id, insn.depends_on) for insn in kernel.instructions) + dep_map = { + insn.id: insn.depends_on for insn in kernel.instructions} - inverse_dep_map = dict((insn.id, set()) for insn in kernel.instructions) + inverse_dep_map = {insn.id: set() for insn in kernel.instructions} - import six - for insn_id, deps in six.iteritems(dep_map): + for insn_id, deps in dep_map.items(): for dep in deps: inverse_dep_map[dep].add(insn_id) @@ -892,6 +889,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} +@remove_any_newly_unused_inames def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): @@ -952,7 +950,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # items that are not "plain" tuples here. 
if not isinstance(inner_expr, tuple): get_args_insn_id = insn_id_gen( - "%s_%s_get" % (insn.id, "_".join(expr.inames))) + "{}_{}_get".format(insn.id, "_".join(expr.inames))) inner_expr = expand_inner_reduction( id=get_args_insn_id, @@ -1037,7 +1035,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, acc_vars = tuple(var(n) for n in acc_var_names) init_id = insn_id_gen( - "%s_%s_init" % (insn.id, "_".join(expr.inames))) + "{}_{}_init".format(insn.id, "_".join(expr.inames))) init_insn = make_assignment( id=init_id, @@ -1051,20 +1049,20 @@ def realize_reduction_for_single_kernel(kernel, callables_table, generated_insns.append(init_insn) update_id = insn_id_gen( - based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames) if insn.within_inames_is_final: update_insn_iname_deps = insn.within_inames | set(expr.inames) - reduction_insn_depends_on = set([init_id]) + reduction_insn_depends_on = {init_id} # In the case of a multi-argument reduction, we need a name for each of # the arguments in order to pass them to the binary op - so we expand # items that are not "plain" tuples here. if nresults > 1 and not isinstance(expr.expr, tuple): get_args_insn_id = insn_id_gen( - "%s_%s_get" % (insn.id, "_".join(expr.inames))) + "{}_{}_get".format(insn.id, "_".join(expr.inames))) reduction_expr = expand_inner_reduction( id=get_args_insn_id, @@ -1113,7 +1111,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, static_max_of_pw_aff( kernel.get_iname_bounds(iname).size, constants_only=True)) - assert isinstance(size, six.integer_types) + assert isinstance(size, int) return size def _make_slab_set(iname, size): @@ -1184,7 +1182,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, base_iname_deps = outer_insn_inames - frozenset(expr.inames) neutral = expr.operation.neutral_element(*arg_dtypes) - init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) + init_id = insn_id_gen(f"{insn.id}_{red_iname}_init") init_insn = make_assignment( id=init_id, assignees=tuple( @@ -1198,7 +1196,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, ) generated_insns.append(init_insn) - init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) + init_neutral_id = insn_id_gen(f"{insn.id}_{red_iname}_init_neutral") init_neutral_insn = make_assignment( id=init_neutral_id, assignees=tuple(var(nvn) for nvn in neutral_var_names), @@ -1210,14 +1208,14 @@ def realize_reduction_for_single_kernel(kernel, callables_table, ) generated_insns.append(init_neutral_insn) - transfer_depends_on = set([init_neutral_id, init_id]) + transfer_depends_on = {init_neutral_id, init_id} # In the case of a multi-argument reduction, we need a name for each of # the arguments in order to pass them to the binary op - so we expand # items that are not "plain" tuples here. 
if nresults > 1 and not isinstance(expr.expr, tuple): get_args_insn_id = insn_id_gen( - "%s_%s_get" % (insn.id, red_iname)) + f"{insn.id}_{red_iname}_get") reduction_expr = expand_inner_reduction( id=get_args_insn_id, @@ -1235,7 +1233,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, else: reduction_expr = expr.expr - transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) + transfer_id = insn_id_gen(f"{insn.id}_{red_iname}_transfer") transfer_insn = make_assignment( id=transfer_id, assignees=tuple( @@ -1380,7 +1378,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, track_iname = var_name_gen( "{sweep_iname}__seq_scan" - .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + .format(sweep_iname=sweep_iname)) get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, @@ -1398,7 +1396,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, acc_vars = tuple(var(n) for n in acc_var_names) init_id = insn_id_gen( - "%s_%s_init" % (insn.id, "_".join(expr.inames))) + "{}_{}_init".format(insn.id, "_".join(expr.inames))) init_insn_depends_on = frozenset() @@ -1420,18 +1418,18 @@ def realize_reduction_for_single_kernel(kernel, callables_table, generated_insns.append(init_insn) - update_insn_depends_on = set([init_insn.id]) | insn.depends_on + update_insn_depends_on = {init_insn.id} | insn.depends_on updated_inner_exprs = ( preprocess_scan_arguments(insn, expr.expr, nresults, scan_iname, track_iname, update_insn_depends_on)) update_id = insn_id_gen( - based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) - update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname]) + update_insn_iname_deps = temp_kernel.insn_inames(insn) | {track_iname} if insn.within_inames_is_final: - update_insn_iname_deps = insn.within_inames | set([track_iname]) + update_insn_iname_deps = insn.within_inames | {track_iname} scan_insn = make_assignment( id=update_id, @@ -1490,7 +1488,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, track_iname = var_name_gen( "{sweep_iname}__pre_scan" - .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + .format(sweep_iname=sweep_iname)) get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, @@ -1538,7 +1536,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if global_barrier is not None: init_insn_depends_on |= frozenset([global_barrier]) - init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname)) + init_id = insn_id_gen(f"{insn.id}_{scan_iname}_init") init_insn = make_assignment( id=init_id, assignees=tuple( @@ -1552,7 +1550,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, ) generated_insns.append(init_insn) - transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on + transfer_insn_depends_on = {init_insn.id} | insn.depends_on updated_inner_exprs = ( preprocess_scan_arguments(insn, expr.expr, nresults, @@ -1563,7 +1561,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, from loopy.symbolic import pw_aff_to_expr sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) - transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname)) + transfer_id = insn_id_gen(f"{insn.id}_{scan_iname}_transfer") transfer_insn = make_assignment( id=transfer_id, assignees=tuple( @@ -1942,8 +1940,6 @@ def realize_reduction_for_single_kernel(kernel, callables_table, from 
loopy.transform.iname import tag_inames kernel = tag_inames(kernel, new_iname_tags) - # TODO: remove unused inames... - kernel = ( _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) @@ -1987,7 +1983,7 @@ def realize_ilp(kernel): filter_iname_tags_by_type) privatizing_inames = frozenset( - iname for iname, tags in six.iteritems(kernel.iname_to_tags) + iname for iname, tags in kernel.iname_to_tags.items() if filter_iname_tags_by_type(tags, (IlpBaseTag, VectorizeTag)) ) @@ -1997,114 +1993,6 @@ def realize_ilp(kernel): # }}} -# {{{ find idempotence ("boostability") of instructions - -def find_idempotence(kernel): - logger.debug("%s: idempotence" % kernel.name) - - writer_map = kernel.writer_map() - - arg_names = set(arg.name for arg in kernel.args) - - var_names = arg_names | set(six.iterkeys(kernel.temporary_variables)) - - reads_map = dict( - (insn.id, insn.read_dependency_names() & var_names) - for insn in kernel.instructions) - - from collections import defaultdict - dep_graph = defaultdict(set) - - for insn in kernel.instructions: - dep_graph[insn.id] = set(writer_id - for var in reads_map[insn.id] - for writer_id in writer_map.get(var, set())) - - # Find SCCs of dep_graph. These are used for checking if the instruction is - # in a dependency cycle. - from loopy.tools import compute_sccs - - sccs = dict((item, scc) - for scc in compute_sccs(dep_graph) - for item in scc) - - non_idempotently_updated_vars = set() - - new_insns = [] - for insn in kernel.instructions: - boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id] - - if not boostable: - non_idempotently_updated_vars.update( - insn.assignee_var_names()) - - new_insns.append(insn.copy(boostable=boostable)) - - # {{{ remove boostability from isns that access non-idempotently updated vars - - new2_insns = [] - for insn in new_insns: - if insn.boostable and bool( - non_idempotently_updated_vars & insn.dependency_names()): - new2_insns.append(insn.copy(boostable=False)) - else: - new2_insns.append(insn) - - # }}} - - return kernel.copy(instructions=new2_insns) - -# }}} - - -# {{{ limit boostability - -def limit_boostability(kernel): - """Finds out which other inames an instruction's inames occur with - and then limits boostability to just those inames. - """ - - logger.debug("%s: limit boostability" % kernel.name) - - iname_occurs_with = {} - for insn in kernel.instructions: - insn_inames = kernel.insn_inames(insn) - for iname in insn_inames: - iname_occurs_with.setdefault(iname, set()).update(insn_inames) - - iname_use_counts = {} - for insn in kernel.instructions: - for iname in kernel.insn_inames(insn): - iname_use_counts[iname] = iname_use_counts.get(iname, 0) + 1 - - single_use_inames = set(iname for iname, uc in six.iteritems(iname_use_counts) - if uc == 1) - - new_insns = [] - for insn in kernel.instructions: - if insn.boostable is None: - raise LoopyError("insn '%s' has undetermined boostability" % insn.id) - elif insn.boostable: - boostable_into = set() - for iname in kernel.insn_inames(insn): - boostable_into.update(iname_occurs_with[iname]) - - boostable_into -= kernel.insn_inames(insn) | single_use_inames - - # Even if boostable_into is empty, leave boostable flag on--it is used - # for boosting into unused hw axes. 
- - insn = insn.copy(boostable_into=boostable_into) - else: - insn = insn.copy(boostable_into=set()) - - new_insns.append(insn) - - return kernel.copy(instructions=new_insns) - -# }}} - - # {{{ check for loads of atomic variables def check_atomic_loads(kernel): @@ -2119,25 +2007,25 @@ def check_atomic_loads(kernel): # find atomic variables atomicity_candidates = ( - set(v.name for v in six.itervalues(kernel.temporary_variables) - if isinstance(v.dtype, AtomicType)) + {v.name for v in kernel.temporary_variables.values() + if isinstance(v.dtype, AtomicType)} | - set(v.name for v in kernel.args + {v.name for v in kernel.args if isinstance(v, ArrayBase) - and isinstance(v.dtype, AtomicType))) + and isinstance(v.dtype, AtomicType)}) new_insns = [] for insn in kernel.instructions: if isinstance(insn, Assignment): # look for atomic variables - atomic_accesses = set(a.var_name for a in insn.atomicity) + atomic_accesses = {a.var_name for a in insn.atomicity} accessed_atomic_vars = (insn.dependency_names() & atomicity_candidates)\ - - set([insn.assignee_var_names()[0]]) + - {insn.assignee_var_names()[0]} if not accessed_atomic_vars <= atomic_accesses: #if we're missing some missed = accessed_atomic_vars - atomic_accesses for x in missed: - if set([x]) & atomicity_candidates: + if {x} & atomicity_candidates: insn = insn.copy( atomicity=insn.atomicity + (AtomicLoad(x),)) @@ -2157,7 +2045,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, caller_kernel, callables_table): - super(ArgDescrInferenceMapper, self).__init__( + super().__init__( rule_mapping_context) self.caller_kernel = caller_kernel self.callables_table = callables_table @@ -2168,23 +2056,23 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction - return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state) + return super().map_call(expr, expn_state) arg_id_to_val = dict(enumerate(expr.parameters)) if isinstance(expr, CallWithKwargs): arg_id_to_val.update(expr.kw_parameters) - if 'assignees' in kwargs: + if "assignees" in kwargs: # If supplied with assignees then this is a CallInstruction - assignees = kwargs['assignees'] + assignees = kwargs["assignees"] for i, arg in enumerate(assignees): arg_id_to_val[-i-1] = arg from loopy.kernel.function_interface import get_arg_descriptor_for_expression - arg_id_to_descr = dict( - (arg_id, get_arg_descriptor_for_expression( - self.caller_kernel, arg)) - for arg_id, arg in six.iteritems(arg_id_to_val)) + arg_id_to_descr = { + arg_id: get_arg_descriptor_for_expression( + self.caller_kernel, arg) + for arg_id, arg in arg_id_to_val.items()} # specializing the function according to the parameter description in_knl_callable = self.callables_table[expr.function.name] @@ -2210,9 +2098,9 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} ) map_call_with_kwargs = map_call @@ -2323,7 +2211,7 @@ def preprocess_single_kernel(kernel, callables_table, device=None): # {{{ check that there are no l.auto-tagged inames from loopy.kernel.data import AutoLocalIndexTagBase - for iname, tags in six.iteritems(kernel.iname_to_tags): + for iname, tags in kernel.iname_to_tags.items(): 
if (filter_iname_tags_by_type(tags, AutoLocalIndexTagBase) and iname in kernel.all_inames()): raise LoopyError("kernel with automatically-assigned " @@ -2363,10 +2251,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): kernel = find_temporary_address_space(kernel) - # boostability should be removed in 2017.x. - kernel = find_idempotence(kernel) - kernel = limit_boostability(kernel) - # check for atomic loads, much easier to do here now that the dependencies # have been established kernel = check_atomic_loads(kernel) diff --git a/loopy/program.py b/loopy/program.py index f862144037aee21e113ad2ccd43b79ceefd39b55..a8bdf91a2a570493e82d25b31784a21dea40801c 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six import re from pytools import ImmutableRecord, memoize_method @@ -76,7 +73,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): """ def __init__(self, rule_mapping_context, kernel, callables_table, function_id_to_in_knl_callable_mappers): - super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.kernel = kernel self.callables_table = callables_table self.function_id_to_in_knl_callable_mappers = ( @@ -131,13 +128,13 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} ) # this is an unknown function as of yet, do not modify it - return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, + return super().map_call_with_kwargs(expr, expn_state) def map_reduction(self, expr, expn_state): @@ -148,7 +145,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): self.callables_table, _ = ( self.callables_table.with_added_callable(func_id, in_knl_callable)) - return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def _default_func_id_to_kernel_callable_mappers(target): @@ -243,7 +240,7 @@ class Program(ImmutableRecord): assert name in callables_table - super(Program, self).__init__( + super().__init__( name=name, callables_table=callables_table, target=target, @@ -260,10 +257,10 @@ class Program(ImmutableRecord): update_persistent_hash = update_persistent_hash def copy(self, **kwargs): - if 'target' in kwargs: + if "target" in kwargs: # target attribute of all the callable kernels should be updated. 
- target = kwargs['target'] - new_self = super(Program, self).copy(**kwargs) + target = kwargs["target"] + new_self = super().copy(**kwargs) new_resolved_functions = {} for func_id, in_knl_callable in ( new_self.callables_table.items()): @@ -280,7 +277,7 @@ class Program(ImmutableRecord): return super(Program, new_self).copy( callables_table=callables_table) else: - return super(Program, self).copy(**kwargs) + return super().copy(**kwargs) def get_grid_size_upper_bounds(self, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -371,7 +368,7 @@ class Program(ImmutableRecord): resolved_functions=new_resolved_functions)) def __iter__(self): - return six.iterkeys(self.callables_table.resolved_functions) + return self.callables_table.resolved_functions.keys() def __getitem__(self, name): result = self.callables_table[name] @@ -432,13 +429,13 @@ def next_indexed_function_identifier(function_id): match = func_name.match(function_id) if match is None: - if function_id[-1] == '_': - return "{old_name}0".format(old_name=function_id) + if function_id[-1] == "_": + return f"{function_id}0" else: - return "{old_name}_0".format(old_name=function_id) + return f"{function_id}_0" - return "{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1) + return "{alpha}_{num}".format(alpha=match.group("alpha"), + num=int(match.group("num"))+1) class ResolvedFunctionRenamer(RuleAwareIdentityMapper): @@ -447,7 +444,7 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): *renaming_dict*. """ def __init__(self, rule_mapping_context, renaming_dict): - super(ResolvedFunctionRenamer, self).__init__( + super().__init__( rule_mapping_context) self.renaming_dict = renaming_dict @@ -455,7 +452,7 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): if expr.name in self.renaming_dict: return ResolvedFunction(self.renaming_dict[expr.name]) else: - return super(ResolvedFunctionRenamer, self).map_resolved_function( + return super().map_resolved_function( expr, expn_state) @@ -504,8 +501,8 @@ class CallablesCountingMapper(CombineMapper): in_knl_callable = self.callables_table[expr.function.name] if isinstance(in_knl_callable, ScalarCallable): return (Counter([expr.function.name]) + - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) elif isinstance(in_knl_callable, CallableKernel): @@ -516,22 +513,22 @@ class CallablesCountingMapper(CombineMapper): self.callables_table)) return (Counter([expr.function.name]) + - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + ( + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) + ( callables_count_in_subkernel) else: raise NotImplementedError("Unknown callable type %s." 
% ( type)) else: return ( - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) map_call_with_kwargs = map_call def map_reduction(self, expr): return Counter(expr.operation.get_scalar_callables()) + ( - super(CallablesCountingMapper, self).map_reduction(expr)) + super().map_reduction(expr)) def map_constant(self, expr): return Counter() @@ -609,10 +606,10 @@ class CallablesTable(ImmutableRecord): history=None, is_being_edited=False): if history is None: - history = dict((func_id, frozenset([func_id])) for func_id in - resolved_functions) + history = {func_id: frozenset([func_id]) for func_id in + resolved_functions} - super(CallablesTable, self).__init__( + super().__init__( resolved_functions=resolved_functions, history=history, is_being_edited=is_being_edited) @@ -624,8 +621,8 @@ class CallablesTable(ImmutableRecord): def __hash__(self): return hash(( - frozenset(six.iteritems(self.resolved_functions)), - frozenset(six.iteritems(self.history)), + frozenset(self.resolved_functions.items()), + frozenset(self.history.items()), self.is_being_edited )) @@ -785,8 +782,8 @@ class CallablesTable(ImmutableRecord): # equal to the old version of the callable. return self, function else: - print('Old: ', self.resolved_functions[function.name]) - print('New: ', in_kernel_callable) + print("Old: ", self.resolved_functions[function.name]) + print("New: ", in_kernel_callable) raise LoopyError("Use 'with_enter_edit_callables_mode' first.") # }}} @@ -874,7 +871,7 @@ class CallablesTable(ImmutableRecord): # this implies that all the function instances having the name # "func_id" have been renamed to something else. for new_func_id in ( - six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): + new_callables_count.keys()-renames_needed.keys()): if old_func_id in self.history[new_func_id]: renames_needed[new_func_id] = old_func_id break @@ -931,13 +928,13 @@ class CallablesTable(ImmutableRecord): return item in self.resolved_functions def items(self): - return six.iteritems(self.resolved_functions) + return self.resolved_functions.items() def values(self): - return six.itervalues(self.resolved_functions) + return self.resolved_functions.values() def keys(self): - return six.iterkeys(self.resolved_functions) + return self.resolved_functions.keys() # }}} diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 5348443c66127baea5068c0bc5bd491abd4b4678..94bdef9043563d2a16d535d14a1eb4fa4f88e801 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six from pytools import ImmutableRecord import sys import islpy as isl @@ -39,6 +36,15 @@ import logging logger = logging.getLogger(__name__) +__doc__ = """ +.. currentmodule:: loopy.schedule + +.. autoclass:: ScheduleItem + +.. 
autoclass:: MinRecursionLimitForScheduling +""" + + # {{{ schedule items class ScheduleItem(ImmutableRecord): @@ -214,17 +220,17 @@ def find_loop_nest_with_map(kernel): from loopy.kernel.data import ConcurrentTag, IlpBaseTag - all_nonpar_inames = set( + all_nonpar_inames = { iname for iname in kernel.all_inames() if not kernel.iname_tags_of_type(iname, - (ConcurrentTag, IlpBaseTag))) + (ConcurrentTag, IlpBaseTag))} iname_to_insns = kernel.iname_to_insns() for iname in all_nonpar_inames: - result[iname] = set(other_iname + result[iname] = {other_iname for insn in iname_to_insns[iname] - for other_iname in kernel.insn_inames(insn) & all_nonpar_inames) + for other_iname in kernel.insn_inames(insn) & all_nonpar_inames} return result @@ -358,8 +364,7 @@ def gen_dependencies_except(kernel, insn_id, except_insn_ids): yield dep_id - for sub_dep_id in gen_dependencies_except(kernel, dep_id, except_insn_ids): - yield sub_dep_id + yield from gen_dependencies_except(kernel, dep_id, except_insn_ids) def get_priority_tiers(wanted, priorities): @@ -401,8 +406,7 @@ def get_priority_tiers(wanted, priorities): wanted = wanted - candidates # Yield recursively - for tier in get_priority_tiers(wanted, priorities): - yield tier + yield from get_priority_tiers(wanted, priorities) def sched_item_to_insn_id(sched_item): @@ -433,25 +437,25 @@ def format_insn(kernel, insn_id): from loopy.kernel.instruction import ( MultiAssignmentBase, NoOpInstruction, BarrierInstruction) if isinstance(insn, MultiAssignmentBase): - return "%s%s%s = %s%s%s {id=%s}" % ( + return "{}{}{} = {}{}{} {{id={}}}".format( Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, Fore.MAGENTA, str(insn.expression), Style.RESET_ALL, format_insn_id(kernel, insn_id)) elif isinstance(insn, BarrierInstruction): - mem_kind = '' - if insn.synchronization_kind == 'local': - mem_kind = '{mem_kind=%s}' % insn.mem_kind + mem_kind = "" + if insn.synchronization_kind == "local": + mem_kind = "{mem_kind=%s}" % insn.mem_kind - return "[%s] %s... %sbarrier%s%s" % ( + return "[{}] {}... {}barrier{}{}".format( format_insn_id(kernel, insn_id), Fore.MAGENTA, insn.synchronization_kind[0], mem_kind, Style.RESET_ALL) elif isinstance(insn, NoOpInstruction): - return "[%s] %s... nop%s" % ( + return "[{}] {}... nop{}".format( format_insn_id(kernel, insn_id), Fore.MAGENTA, Style.RESET_ALL) else: - return "[%s] %s%s%s" % ( + return "[{}] {}{}{}".format( format_insn_id(kernel, insn_id), Fore.CYAN, str(insn), Style.RESET_ALL) @@ -470,7 +474,7 @@ def dump_schedule(kernel, schedule): lines.append(indent + "end %s" % sched_item.iname) elif isinstance(sched_item, CallKernel): lines.append(indent + - "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % ( + "CALL KERNEL {}(extra_args={}, extra_inames={})".format( sched_item.kernel_name, sched_item.extra_args, sched_item.extra_inames)) @@ -561,7 +565,7 @@ class ScheduleDebugInput(Exception): # }}} -# {{{ scheduling algorithm +# {{{ scheduler state class SchedulerState(ImmutableRecord): """ @@ -569,10 +573,6 @@ class SchedulerState(ImmutableRecord): .. attribute:: loop_nest_around_map - .. attribute:: loop_priority - - See :func:`loop_nest_around_map`. - .. attribute:: breakable_inames .. attribute:: ilp_inames @@ -586,6 +586,11 @@ class SchedulerState(ImmutableRecord): .. rubric:: Time-varying scheduler state + .. attribute:: insn_ids_to_try + + :class:`list` of unscheduled instruction ids in a decreasing priority + order. + .. attribute:: active_inames A tuple of active inames. 
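The scheduler state above keeps its candidate instructions ordered: `insn_ids_to_try` is priority-ordered, and the hunks that follow add `insns_in_topologically_sorted_order`, a dependency-respecting order with instruction priority as the tie breaker. For readers who want the idea in isolation, here is a minimal, self-contained sketch of such an ordering. `ToyInsn` and `toposort_with_priorities` are illustrative names invented for this note, not loopy API; the real code delegates to `pytools.graph.compute_topological_order`.

    import heapq
    from dataclasses import dataclass, field

    @dataclass(frozen=True)
    class ToyInsn:
        id: str
        priority: int = 0
        depends_on: frozenset = field(default_factory=frozenset)

    def toposort_with_priorities(insns):
        # Kahn's algorithm; among the currently schedulable instructions,
        # pick the highest-priority one first (hence the negated priority
        # in the heap key, mirroring the 'key' callback in the hunk below).
        by_id = {insn.id: insn for insn in insns}
        indegree = {insn.id: len(insn.depends_on) for insn in insns}
        dependents = {insn.id: set() for insn in insns}
        for insn in insns:
            for dep in insn.depends_on:
                dependents[dep].add(insn.id)

        heap = [(-by_id[i].priority, i) for i, d in indegree.items() if d == 0]
        heapq.heapify(heap)
        order = []
        while heap:
            _, iid = heapq.heappop(heap)
            order.append(by_id[iid])
            for succ in dependents[iid]:
                indegree[succ] -= 1
                if indegree[succ] == 0:
                    heapq.heappush(heap, (-by_id[succ].priority, succ))
        assert len(order) == len(insns), "dependency cycle"
        return order

    insns = [
        ToyInsn("a"),
        ToyInsn("b", priority=5, depends_on=frozenset({"a"})),
        ToyInsn("c", priority=1, depends_on=frozenset({"a"})),
    ]
    print([insn.id for insn in toposort_with_priorities(insns)])
    # prints ['a', 'b', 'c']: 'b' wins the tie against 'c' by priority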
@@ -637,10 +642,10 @@ class SchedulerState(ImmutableRecord):
         in them that are left to schedule. If a group name occurs in this
         mapping, that group is considered active.

-    .. attribute:: uses_of_boostability
+    .. attribute:: insns_in_topologically_sorted_order

-        Used to produce warnings about deprecated 'boosting' behavior
-        Should be removed along with boostability in 2017.x.
+        A list of loopy :class:`Instruction` objects in topologically sorted
+        order, with instruction priorities as the tie breaker.
     """

     @property
@@ -650,25 +655,172 @@ class SchedulerState(ImmutableRecord):
         else:
             return None

+# }}}
+
+
+def get_insns_in_topologically_sorted_order(kernel):
+    from pytools.graph import compute_topological_order
+
+    rev_dep_map = {insn.id: set() for insn in kernel.instructions}
+    for insn in kernel.instructions:
+        for dep in insn.depends_on:
+            rev_dep_map[dep].add(insn.id)
+
+    def key(insn_id):
+        # negative of insn.priority because
+        # pytools.graph.compute_topological_order schedules the nodes with
+        # lower 'key' first in case of a tie.
+        return (-kernel.id_to_insn[insn_id].priority, insn_id)
+
+    ids = compute_topological_order(rev_dep_map, key=key)
+    return [kernel.id_to_insn[insn_id] for insn_id in ids]
+
+
+# {{{ schedule_as_many_run_insns_as_possible
+
+def schedule_as_many_run_insns_as_possible(sched_state, template_insn):
+    """
+    Returns an instance of :class:`loopy.schedule.SchedulerState` obtained by
+    appending all reachable instructions that are similar to *template_insn*.
+    We define two instructions to be similar if:
+
+    * Both are within the same set of non-parallel inames.
+    * Both belong to the same groups.
+    * Both conflict with the same groups.
+    """
+
+    # {{{ bail when implementation is unsupported
+
+    next_preschedule_item = (
+        sched_state.preschedule[0]
+        if sched_state.preschedule
+        else None)
+
+    if isinstance(next_preschedule_item, (CallKernel, ReturnFromKernel,
+            Barrier, EnterLoop, LeaveLoop)):
+        return sched_state
+
+    if not sched_state.within_subkernel:
+        # cannot schedule RunInstructions when not in subkernel
+        return sched_state
+
+    # }}}
+
+    preschedule = sched_state.preschedule[:]
+    have_inames = template_insn.within_inames - sched_state.parallel_inames
+    toposorted_insns = sched_state.insns_in_topologically_sorted_order
+
+    # {{{ helpers
+
+    def next_preschedule_insn_id():
+        return (next(iter(sched_item_to_insn_id(preschedule[0])), None)
+                if preschedule
+                else None)
+
+    def is_similar_to_template(insn):
+        if ((insn.within_inames - sched_state.parallel_inames)
+                != have_inames):
+            # sched_state.parallel_inames contains inames for which no
+            # EnterLoop/LeaveLoop nodes occur.
+            # FIXME: Should really rename that
+            return False
+        if insn.groups != template_insn.groups:
+            return False
+        if insn.conflicts_with_groups != template_insn.conflicts_with_groups:
+            return False
+
+        return True
+
+    # }}}
+
+    # select only the leading instructions in toposorted_insns whose
+    # inames match the active inames of sched_state
+    newly_scheduled_insn_ids = []
+    ignored_unscheduled_insn_ids = set()
+
+    # left_over_toposorted_insns: unscheduled insns in topologically sorted
+    # order
+    left_over_toposorted_insns = []
+
+    for i, insn in enumerate(toposorted_insns):
+        assert insn.id not in sched_state.scheduled_insn_ids
+
+        if is_similar_to_template(insn):
+            # check reachability
+            if not (insn.depends_on & ignored_unscheduled_insn_ids):
+                if insn.id in sched_state.prescheduled_insn_ids:
+                    if next_preschedule_insn_id() == insn.id:
+                        preschedule.pop(0)
+                        newly_scheduled_insn_ids.append(insn.id)
+                        continue
+                else:
+                    newly_scheduled_insn_ids.append(insn.id)
+                    continue
+
+        left_over_toposorted_insns.append(insn)
+        ignored_unscheduled_insn_ids.add(insn.id)
+
+        # HEURISTIC: To avoid quadratic time complexity, we bail out of
+        # adding new instructions once the number of ignored unscheduled
+        # insn ids exceeds 5.
+        # TODO: Find a stronger solution that answers in O(1) time and
+        # O(N) space when "no further instructions can be scheduled",
+        # i.e. when either:
+        # - No similar instructions are present in toposorted_insns.
+        # - No instruction in toposorted_insns is reachable due to
+        #   instructions that were ignored.
+        if len(ignored_unscheduled_insn_ids) > 5:
+            left_over_toposorted_insns.extend(toposorted_insns[i+1:])
+            break
+
+    sched_items = tuple(RunInstruction(insn_id=insn_id) for insn_id in
+            newly_scheduled_insn_ids)
+
+    updated_schedule = sched_state.schedule + sched_items
+    updated_scheduled_insn_ids = (sched_state.scheduled_insn_ids
+            | frozenset(newly_scheduled_insn_ids))
+    updated_unscheduled_insn_ids = (
+            sched_state.unscheduled_insn_ids
+            - frozenset(newly_scheduled_insn_ids))
+    new_insn_ids_to_try = (None if newly_scheduled_insn_ids
+            else sched_state.insn_ids_to_try)
+
+    new_active_group_counts = sched_state.active_group_counts.copy()
+    if newly_scheduled_insn_ids:
+        # all the newly scheduled insns belong to the same groups as
+        # template_insn
+        for grp in template_insn.groups:
+            new_active_group_counts[grp] -= len(newly_scheduled_insn_ids)
+            if new_active_group_counts[grp] == 0:
+                new_active_group_counts.pop(grp)
+
+    return sched_state.copy(
+            schedule=updated_schedule,
+            scheduled_insn_ids=updated_scheduled_insn_ids,
+            unscheduled_insn_ids=updated_unscheduled_insn_ids,
+            preschedule=preschedule,
+            insn_ids_to_try=new_insn_ids_to_try,
+            active_group_counts=new_active_group_counts,
+            insns_in_topologically_sorted_order=left_over_toposorted_insns
+            )
+
+# }}}
+
+
+# {{{ scheduling algorithm

 def generate_loop_schedules_internal(
-        sched_state, allow_boost=False, debug=None):
+        sched_state, debug=None):
     # allow_insn is set to False initially and after entering each loop
     # to give loops containing high-priority instructions a chance.
kernel = sched_state.kernel Fore = kernel.options._fore # noqa Style = kernel.options._style # noqa - if allow_boost is None: - rec_allow_boost = None - else: - rec_allow_boost = False - active_inames_set = frozenset(sched_state.active_inames) next_preschedule_item = ( sched_state.preschedule[0] - if len(sched_state.preschedule) > 0 + if sched_state.preschedule else None) # {{{ decide about debug mode @@ -693,11 +845,10 @@ def generate_loop_schedules_internal( print(75*"=") print("PRESCHEDULED ITEMS AWAITING SCHEDULING:") print(dump_schedule(sched_state.kernel, sched_state.preschedule)) - #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP (inner: outer):") - for iname, val in six.iteritems(sched_state.loop_nest_around_map): - print("%s : %s" % (iname, ", ".join(val))) + for iname, val in sched_state.loop_nest_around_map.items(): + print("{} : {}".format(iname, ", ".join(val))) print(75*"=") if debug.debug_length == len(debug.longest_rejected_schedule): @@ -712,30 +863,26 @@ def generate_loop_schedules_internal( if isinstance(next_preschedule_item, CallKernel): assert sched_state.within_subkernel is False - for result in generate_loop_schedules_internal( + yield from generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:], within_subkernel=True, may_schedule_global_barriers=False, enclosing_subkernel_inames=sched_state.active_inames), - allow_boost=rec_allow_boost, - debug=debug): - yield result + debug=debug) if isinstance(next_preschedule_item, ReturnFromKernel): assert sched_state.within_subkernel is True # Make sure all subkernel inames have finished. if sched_state.active_inames == sched_state.enclosing_subkernel_inames: - for result in generate_loop_schedules_internal( + yield from generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:], within_subkernel=False, may_schedule_global_barriers=True), - allow_boost=rec_allow_boost, - debug=debug): - yield result + debug=debug) # }}} @@ -748,13 +895,11 @@ def generate_loop_schedules_internal( if ( isinstance(next_preschedule_item, Barrier) and next_preschedule_item.originating_insn_id is None): - for result in generate_loop_schedules_internal( + yield from generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:]), - allow_boost=rec_allow_boost, - debug=debug): - yield result + debug=debug) # }}} @@ -793,28 +938,11 @@ def generate_loop_schedules_internal( is_ready = insn.depends_on <= sched_state.scheduled_insn_ids if not is_ready: - if debug_mode: - # These are not that interesting when understanding scheduler - # failures. - - # print("instruction '%s' is missing insn depedencies '%s'" % ( - # format_insn(kernel, insn.id), ",".join( - # insn.depends_on - sched_state.scheduled_insn_ids))) - pass continue want = kernel.insn_inames(insn) - sched_state.parallel_inames have = active_inames_set - sched_state.parallel_inames - # If insn is boostable, it may be placed inside a more deeply - # nested loop without harm. - - orig_have = have - if allow_boost: - # Note that the inames in 'insn.boostable_into' necessarily won't - # be contained in 'want'. 
- have = have - insn.boostable_into - if want != have: is_ready = False @@ -908,7 +1036,7 @@ def generate_loop_schedules_internal( # }}} - # {{{ update instruction_ids_to_try + # {{{ update instruction_ids_to_try/toposorted_insns new_insn_ids_to_try = list(insn_ids_to_try) new_insn_ids_to_try.remove(insn.id) @@ -918,13 +1046,10 @@ def generate_loop_schedules_internal( sched_state.active_group_counts.keys()): new_insn_ids_to_try = None - # }}} + new_toposorted_insns = sched_state.insns_in_topologically_sorted_order[:] + new_toposorted_insns.remove(insn) - new_uses_of_boostability = [] - if allow_boost: - if orig_have & insn.boostable_into: - new_uses_of_boostability.append( - (insn.id, orig_have & insn.boostable_into)) + # }}} new_sched_state = sched_state.copy( scheduled_insn_ids=sched_state.scheduled_insn_ids | iid_set, @@ -937,17 +1062,18 @@ def generate_loop_schedules_internal( if insn_id not in sched_state.prescheduled_insn_ids else sched_state.preschedule[1:]), active_group_counts=new_active_group_counts, - uses_of_boostability=( - sched_state.uses_of_boostability - + new_uses_of_boostability) + insns_in_topologically_sorted_order=new_toposorted_insns, ) + new_sched_state = schedule_as_many_run_insns_as_possible(new_sched_state, + insn) + # Don't be eager about entering/leaving loops--if progress has been # made, revert to top of scheduler and see if more progress can be # made. for sub_sched in generate_loop_schedules_internal( new_sched_state, - allow_boost=rec_allow_boost, debug=debug): + debug=debug): yield sub_sched if not sched_state.group_insn_counts: @@ -989,12 +1115,10 @@ def generate_loop_schedules_internal( # outside of last_entered_loop. for subdep_id in gen_dependencies_except(kernel, insn_id, sched_state.scheduled_insn_ids): - subdep = kernel.id_to_insn[insn_id] want = (kernel.insn_inames(subdep_id) - sched_state.parallel_inames) if ( - last_entered_loop not in want and - last_entered_loop not in subdep.boostable_into): + last_entered_loop not in want): print( "%(warn)swarning:%(reset_all)s '%(iname)s', " "which the schedule is " @@ -1048,13 +1172,14 @@ def generate_loop_schedules_internal( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), active_inames=sched_state.active_inames[:-1], + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if last_entered_loop not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), - allow_boost=rec_allow_boost, debug=debug): + debug=debug): yield sub_sched return @@ -1083,7 +1208,7 @@ def generate_loop_schedules_internal( print("reachable insns:", ",".join(reachable_insn_ids)) print("active groups (with insn counts):", ",".join( "%s: %d" % (grp, c) - for grp, c in six.iteritems(sched_state.active_group_counts))) + for grp, c in sched_state.active_group_counts.items())) print(75*"-") if needed_inames: @@ -1165,11 +1290,11 @@ def generate_loop_schedules_internal( usefulness = None # highest insn priority enabled by iname - hypothetically_active_loops = active_inames_set | set([iname]) + hypothetically_active_loops = active_inames_set | {iname} for insn_id in reachable_insn_ids: insn = kernel.id_to_insn[insn_id] - want = kernel.insn_inames(insn) | insn.boostable_into + want = kernel.insn_inames(insn) if hypothetically_active_loops <= want: if usefulness is None: @@ -1193,7 +1318,7 @@ def generate_loop_schedules_internal( loop_priority_set = set().union(*[set(prio) for prio in sched_state.kernel.loop_priority]) - useful_loops_set = set(six.iterkeys(iname_to_usefulness)) + useful_loops_set = 
set(iname_to_usefulness.keys()) useful_and_desired = useful_loops_set & loop_priority_set if useful_and_desired: @@ -1264,12 +1389,12 @@ def generate_loop_schedules_internal( entered_inames=( sched_state.entered_inames | frozenset((iname,))), + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if iname not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), - allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True yield sub_sched @@ -1281,7 +1406,7 @@ def generate_loop_schedules_internal( if debug_mode: print(75*"=") - inp = six.moves.input("Hit Enter for next schedule, " + inp = input("Hit Enter for next schedule, " "or enter a number to examine schedules of a " "different length:") if inp: @@ -1294,28 +1419,11 @@ def generate_loop_schedules_internal( # if done, yield result debug.log_success(sched_state.schedule) - for boost_insn_id, boost_inames in sched_state.uses_of_boostability: - warn_with_kernel( - kernel, "used_boostability", - "instruction '%s' was implicitly nested inside " - "inames '%s' based on an idempotence heuristic. " - "This is deprecated and will stop working in loopy 2017.x." - % (boost_insn_id, ", ".join(boost_inames)), - DeprecationWarning) - yield sched_state.schedule else: - if not allow_boost and allow_boost is not None: - # try again with boosting allowed - for sub_sched in generate_loop_schedules_internal( - sched_state, - allow_boost=True, debug=debug): - yield sub_sched - else: - # dead end - if debug is not None: - debug.log_dead_end(sched_state.schedule) + if debug is not None: + debug.log_dead_end(sched_state.schedule) # }}} @@ -1379,7 +1487,7 @@ class DependencyRecord(ImmutableRecord): var_kind=var_kind) -class DependencyTracker(object): +class DependencyTracker: """ A utility to help track dependencies between originating from a set of sources (as defined by :meth:`add_source`. 
For each target, @@ -1487,9 +1595,8 @@ class DependencyTracker(object): ("w", "any", self.base_access_map), ]: - for dep in self.get_conflicting_accesses( - target, tgt_dir, src_dir, src_base_var_to_accessor_map): - yield dep + yield from self.get_conflicting_accesses( + target, tgt_dir, src_dir, src_base_var_to_accessor_map) def get_conflicting_accesses(self, target, tgt_dir, src_dir, src_base_var_to_accessor_map): @@ -1503,11 +1610,11 @@ class DependencyTracker(object): dir_to_getter = {"w": get_written_names, "any": get_accessed_names} def filter_var_set_for_base_storage(var_name_set, base_storage_name): - return set( + return { name for name in var_name_set if (self.temp_to_base_storage.get(name, name) - == base_storage_name)) + == base_storage_name)} tgt_accessed_vars = dir_to_getter[tgt_dir](target) tgt_accessed_vars_base = self.map_to_base_storage(tgt_accessed_vars) @@ -1637,8 +1744,8 @@ def _insn_ids_reaching_end(schedule, kind, reverse): sched_item.synchronization_kind, kind): insn_ids_alive_at_scope[-1].clear() else: - insn_ids_alive_at_scope[-1] |= set( - insn_id for insn_id in sched_item_to_insn_id(sched_item)) + insn_ids_alive_at_scope[-1] |= { + insn_id for insn_id in sched_item_to_insn_id(sched_item)} assert len(insn_ids_alive_at_scope) == 1 return insn_ids_alive_at_scope[-1] @@ -1660,7 +1767,7 @@ def append_barrier_or_raise_error(kernel_name, schedule, dep, verify_only): dep.variable, dep.var_kind)) else: - comment = "for %s (%s)" % ( + comment = "for {} ({})".format( dep.variable, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id)) schedule.append(Barrier( @@ -1830,22 +1937,21 @@ def generate_loop_schedules(kernel, callables_table, debug_args={}): .. warning:: This function needs to be called inside (another layer) of a - :class:`MinRecursionLimitForScheduling` context manager, and the - context manager needs to end *after* the last reference to the + :class:`loopy.schedule.MinRecursionLimitForScheduling` context manager, + and the context manager needs to end *after* the last reference to the generators has gone out of scope. Otherwise, the high-recursion-limit generator chain may not be successfully garbage-collected and cause an internal error in the Python runtime. 
""" with MinRecursionLimitForScheduling(kernel): - for sched in generate_loop_schedules_inner(kernel, - callables_table, debug_args=debug_args): - yield sched + yield from generate_loop_schedules_inner(kernel, + callables_table, debug_args=debug_args) def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState - if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): + if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") @@ -1856,32 +1962,32 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): debug = ScheduleDebugger(**debug_args) - preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else () + preschedule = kernel.schedule if kernel.state == KernelState.LINEARIZED else () - prescheduled_inames = set( + prescheduled_inames = { insn.iname for insn in preschedule - if isinstance(insn, EnterLoop)) + if isinstance(insn, EnterLoop)} - prescheduled_insn_ids = set( + prescheduled_insn_ids = { insn_id for item in preschedule - for insn_id in sched_item_to_insn_id(item)) + for insn_id in sched_item_to_insn_id(item)} from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, filter_iname_tags_by_type) - ilp_inames = set( + ilp_inames = { iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, IlpBaseTag)) - vec_inames = set( + for iname, tags in kernel.iname_to_tags.items() + if filter_iname_tags_by_type(tags, IlpBaseTag)} + vec_inames = { iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, VectorizeTag)) - parallel_inames = set( + for iname, tags in kernel.iname_to_tags.items() + if filter_iname_tags_by_type(tags, VectorizeTag)} + parallel_inames = { iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, ConcurrentTag)) + for iname, tags in kernel.iname_to_tags.items() + if filter_iname_tags_by_type(tags, ConcurrentTag)} loop_nest_with_map = find_loop_nest_with_map(kernel) loop_nest_around_map = find_loop_nest_around_map(kernel) @@ -1906,9 +2012,9 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): schedule=(), - unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), + unscheduled_insn_ids={insn.id for insn in kernel.instructions}, scheduled_insn_ids=frozenset(), - within_subkernel=kernel.state != KernelState.SCHEDULED, + within_subkernel=kernel.state != KernelState.LINEARIZED, may_schedule_global_barriers=True, preschedule=preschedule, @@ -1920,15 +2026,15 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): group_insn_counts=group_insn_counts(kernel), active_group_counts={}, - uses_of_boostability=[]) + insns_in_topologically_sorted_order=( + get_insns_in_topologically_sorted_order(kernel)), + ) schedule_gen_kwargs = {} - if kernel.options.ignore_boostable_into: - schedule_gen_kwargs["allow_boost"] = None def print_longest_dead_end(): if debug.interactive: - print("Loo.py will now show you the scheduler state at the point") + print("Loopy will now show you the scheduler state at the point") print("where the longest (dead-end) schedule was generated, in the") print("the hope that some of this makes sense and helps you find") print("the issue.") @@ -1937,7 +2043,7 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): print(" 
debug_args=dict(interactive=False)") print("to generate_loop_schedules().") print(75*"-") - six.moves.input("Enter:") + input("Enter:") print() print() @@ -1978,11 +2084,11 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): new_kernel = kernel.copy( schedule=gen_sched, - state=KernelState.SCHEDULED) + state=KernelState.LINEARIZED) from loopy.schedule.device_mapping import \ map_schedule_onto_host_or_device - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: # Device mapper only gets run once. new_kernel = map_schedule_onto_host_or_device(new_kernel) @@ -2005,7 +2111,7 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): debug.done_scheduling() if not schedule_count: print(75*"-") - print("ERROR: Sorry--loo.py did not find a schedule for your kernel.") + print("ERROR: Sorry--loopy did not find a schedule for your kernel.") print(75*"-") print_longest_dead_end() raise RuntimeError("no valid schedules found") diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 59afb07d2e9b7713dbe86c2c5aef7356decbbcff..089d4e600a13f8cf605b85fe29389bb28e39481a 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2016 Matt Wala" __license__ = """ @@ -31,7 +29,7 @@ from loopy.schedule.tools import get_block_boundaries def map_schedule_onto_host_or_device(kernel): # FIXME: Should be idempotent. from loopy.kernel import KernelState - assert kernel.state == KernelState.SCHEDULED + assert kernel.state == KernelState.LINEARIZED from functools import partial device_prog_name_gen = partial( diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index e0129fd98417f26a501138a92de4a67614f1a139..afcdfb07bbde81b8211bff0909ae26a5a7a67a07 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2016 Matt Wala" __license__ = """ @@ -88,7 +86,7 @@ def add_extra_args_to_schedule(kernel): temporaries_read_in_subkernel(kernel, subkernel) | temporaries_written_in_subkernel(kernel, subkernel)) - more_args = set(tv + more_args = {tv for tv in used_temporaries if kernel.temporary_variables[tv].address_space @@ -96,7 +94,7 @@ def add_extra_args_to_schedule(kernel): and kernel.temporary_variables[tv].initializer is None and - tv not in sched_item.extra_args) + tv not in sched_item.extra_args} new_schedule.append(sched_item.copy( extra_args=sched_item.extra_args + sorted(more_args))) diff --git a/loopy/statistics.py b/loopy/statistics.py index 86f39e55bd0e5de2773ee3b5b42a08885191a9c6..a1c86d88bb6e8c97d757683d3fa2aebdee7f9a7a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1,11 +1,10 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = """ Copyright (C) 2015 James Stevens Copyright (C) 2018 Kaushik Kulkarni Copyright (C) 2019 Andreas Kloeckner """ + __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,9 +25,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from functools import partial -import six - import loopy as lp from islpy import dim_type import islpy as isl @@ -41,6 +37,7 @@ from pytools import ImmutableRecord, memoize_method from loopy.kernel.function_interface import CallableKernel from loopy.kernel import LoopKernel from loopy.program import make_program +from functools import partial __doc__ = """ @@ -100,7 +97,7 @@ def _get_param_tuple(obj): for i in range(obj.dim(dim_type.param))) -class GuardedPwQPolynomial(object): +class GuardedPwQPolynomial: def __init__(self, pwqpolynomial, valid_domain): self.pwqpolynomial = pwqpolynomial self.valid_domain = valid_domain @@ -153,7 +150,7 @@ class GuardedPwQPolynomial(object): @staticmethod def zero(): - p = isl.PwQPolynomial('{ 0 }') + p = isl.PwQPolynomial("{ 0 }") return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) def __str__(self): @@ -167,7 +164,7 @@ class GuardedPwQPolynomial(object): # {{{ ToCountMap -class ToCountMap(object): +class ToCountMap: """A map from work descriptors like :class:`Op` and :class:`MemAccess` to any arithmetic type. @@ -203,23 +200,28 @@ class ToCountMap(object): def __add__(self, other): result = self.count_map.copy() - for k, v in six.iteritems(other.count_map): + for k, v in other.count_map.items(): result[k] = self.count_map.get(k, 0) + v return self.copy(count_map=result) def __radd__(self, other): if other != 0: raise ValueError("ToCountMap: Attempted to add ToCountMap " - "to {0} {1}. ToCountMap may only be added to " + "to {} {}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) return self def __mul__(self, other): - return self.copy(dict( - (index, value*other) - for index, value in six.iteritems(self.count_map))) + if isinstance(other, GuardedPwQPolynomial): + return self.copy({ + index: value*other + for index, value in self.count_map.items()}) + else: + raise ValueError("ToCountMap: Attempted to multiply " + "ToCountMap by {} {}." + .format(type(other), other)) __rmul__ = __mul__ @@ -231,8 +233,8 @@ class ToCountMap(object): def __str__(self): return "\n".join( - "%s: %s" % (k, v) - for k, v in sorted(six.iteritems(self.count_map), + f"{k}: {v}" + for k, v in sorted(self.count_map.items(), key=lambda k: str(k))) def __len__(self): @@ -257,9 +259,9 @@ class ToCountMap(object): return type(self)(count_map=count_map) def with_set_attributes(self, **kwargs): - return self.copy(count_map=dict( - (key.copy(**kwargs), val) - for key, val in six.iteritems(self.count_map))) + return self.copy(count_map={ + key.copy(**kwargs): val + for key, val in self.count_map.items()}) def filter_by(self, **kwargs): """Remove items without specified key fields. 
@@ -276,10 +278,10 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) - filtered_map = mem_map.filter_by(direction=['load'], - variable=['a','g']) + filtered_map = mem_map.filter_by(direction=["load"], + variable=["a","g"]) tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -292,16 +294,16 @@ class ToCountMap(object): pass new_kwargs = {} - for arg_field, allowable_vals in six.iteritems(kwargs): + for arg_field, allowable_vals in kwargs.items(): if arg_field == "dtype": from loopy.types import to_loopy_type allowable_vals = [to_loopy_type(dtype) for dtype in allowable_vals] new_kwargs[arg_field] = allowable_vals - for key, val in six.iteritems(self.count_map): + for key, val in self.count_map.items(): if all(getattr(key, arg_field, _Sentinel) in allowable_vals - for arg_field, allowable_vals in six.iteritems(new_kwargs)): + for arg_field, allowable_vals in new_kwargs.items()): new_count_map[key] = val return self.copy(count_map=new_count_map) @@ -319,7 +321,7 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) def filter_func(key): return key.lid_strides[0] > 1 and key.lid_strides[0] <= 4: @@ -333,7 +335,7 @@ class ToCountMap(object): new_count_map = {} - for self_key, self_val in six.iteritems(self.count_map): + for self_key, self_val in self.count_map.items(): if func(self_key): new_count_map[self_key] = self_val @@ -353,29 +355,29 @@ class ToCountMap(object): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = get_mem_access_map(knl) - grouped_map = mem_map.group_by('mtype', 'dtype', 'direction') + grouped_map = mem_map.group_by("mtype", "dtype", "direction") - f32_global_ld = grouped_map[MemAccess(mtype='global', + f32_global_ld = grouped_map[MemAccess(mtype="global", dtype=np.float32, - direction='load') + direction="load") ].eval_with_dict(params) - f32_global_st = grouped_map[MemAccess(mtype='global', + f32_global_st = grouped_map[MemAccess(mtype="global", dtype=np.float32, - direction='store') + direction="store") ].eval_with_dict(params) - f32_local_ld = grouped_map[MemAccess(mtype='local', + f32_local_ld = grouped_map[MemAccess(mtype="local", dtype=np.float32, - direction='load') + direction="load") ].eval_with_dict(params) - f32_local_st = grouped_map[MemAccess(mtype='local', + f32_local_st = grouped_map[MemAccess(mtype="local", dtype=np.float32, - direction='store') + direction="store") ].eval_with_dict(params) op_map = get_op_map(knl) - ops_dtype = op_map.group_by('dtype') + ops_dtype = op_map.group_by("dtype") f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params) f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params) @@ -396,11 +398,11 @@ class ToCountMap(object): else: return self - for self_key, self_val in six.iteritems(self.count_map): + for self_key, self_val in self.count_map.items(): new_key = key_type( - **dict( - (field, getattr(self_key, field)) - for field in args)) + **{ + field: getattr(self_key, field) + for field in args}) new_count_map[new_key] = new_count_map.get(new_key, 0) + self_val @@ -418,20 +420,20 @@ class ToCountMap(object): # (first create loopy kernel and specify 
array data types) bytes_map = get_mem_access_map(knl).to_bytes() - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} s1_g_ld_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 1}, - direction=['load']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 1}, + direction=["load"]).eval_and_sum(params) s2_g_ld_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 2}, - direction=['load']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 2}, + direction=["load"]).eval_and_sum(params) s1_g_st_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 1}, - direction=['store']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 1}, + direction=["store"]).eval_and_sum(params) s2_g_st_byt = bytes_map.filter_by( - mtype=['global'], lid_strides={0: 2}, - direction=['store']).eval_and_sum(params) + mtype=["global"], lid_strides={0: 2}, + direction=["store"]).eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -439,7 +441,7 @@ class ToCountMap(object): new_count_map = {} - for key, val in six.iteritems(self.count_map): + for key, val in self.count_map.items(): new_count_map[key] = int(key.dtype.itemsize) * val return self.copy(new_count_map) @@ -449,7 +451,7 @@ class ToCountMap(object): total = self._zero() - for k, v in six.iteritems(self.count_map): + for k, v in self.count_map.items(): total += v return total @@ -475,7 +477,7 @@ class ToCountPolynomialMap(ToCountMap): space_param_tuple = _get_param_tuple(space) - for key, val in six.iteritems(count_map): + for key, val in count_map.items(): if isinstance(val, isl.PwQPolynomial): assert val.dim(dim_type.out) == 1 elif isinstance(val, GuardedPwQPolynomial): @@ -485,7 +487,7 @@ class ToCountPolynomialMap(ToCountMap): assert _get_param_tuple(val.space) == space_param_tuple - super(ToCountPolynomialMap, self).__init__(count_map) + super().__init__(count_map) def _zero(self): space = self.space.insert_dims(dim_type.out, 0, 1) @@ -510,10 +512,10 @@ class ToCountPolynomialMap(ToCountMap): # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = lp.get_mem_access_map(knl) - filtered_map = mem_map.filter_by(direction=['load'], - variable=['a', 'g']) + filtered_map = mem_map.filter_by(direction=["load"], + variable=["a", "g"]) tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -549,7 +551,7 @@ def subst_into_guarded_pwqpolynomial(new_space, guarded_poly, subst_dict): def subst_into_to_count_map(space, tcm, subst_dict): from loopy.isl_helpers import subst_into_pwqpolynomial new_count_map = {} - for key, value in six.iteritems(tcm.count_map): + for key, value in tcm.count_map.items(): if isinstance(value, GuardedPwQPolynomial): new_count_map[key] = subst_into_guarded_pwqpolynomial( space, value, subst_dict) @@ -576,13 +578,13 @@ def stringify_stats_mapping(m): result = "" for key in sorted(m.keys(), key=lambda k: str(k)): - result += ("%s : %s\n" % (key, m[key])) + result += ("{} : {}\n".format(key, m[key])) return result # {{{ CountGranularity -class CountGranularity(object): +class CountGranularity: """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. @@ -618,7 +620,7 @@ class Op(ImmutableRecord): .. 
attribute:: dtype - A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the data type operated on. .. attribute:: name @@ -632,7 +634,7 @@ class Op(ImmutableRecord): once per *work-item*, *sub-group*, or *work-group*. The granularities allowed can be found in :class:`CountGranularity`, and may be accessed, e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance - of computation executing on a single processor (think 'thread'), a + of computation executing on a single processor (think "thread"), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all work-items within the work-group sharing local memory. A sub-group is an @@ -656,17 +658,17 @@ class Op(ImmutableRecord): from loopy.types import to_loopy_type dtype = to_loopy_type(dtype) - super(Op, self).__init__(dtype=dtype, name=name, + super().__init__(dtype=dtype, name=name, count_granularity=count_granularity, kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness if self.kernel_name is not None: - return "Op(%s, %s, %s, %s)" % ( - self.dtype, self.name, self.count_granularity, self.kernel_name) + return (f"Op({self.dtype}, {self.name}, {self.count_granularity}," + f" {self.kernel_name})") else: - return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) + return f"Op({self.dtype}, {self.name}, {self.count_granularity})" # }}} @@ -683,7 +685,7 @@ class MemAccess(ImmutableRecord): .. attribute:: dtype - A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + A :class:`loopy.types.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. .. attribute:: lid_strides @@ -717,7 +719,7 @@ class MemAccess(ImmutableRecord): .. attribute:: variable_tag A :class:`str` that specifies the variable tag of a - :class:`pymbolic.primitives.TaggedVariable`. + :class:`loopy.symbolic.TaggedVariable`. .. attribute:: count_granularity @@ -725,7 +727,7 @@ class MemAccess(ImmutableRecord): once per *work-item*, *sub-group*, or *work-group*. The granularities allowed can be found in :class:`CountGranularity`, and may be accessed, e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance - of computation executing on a single processor (think 'thread'), a + of computation executing on a single processor (think "thread"), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all work-items within the work-group sharing local memory. 
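To make the count_granularity description above concrete: an event counted once per sub-group happens `work_group_size // subgroup_size` times per work-group, versus once per work-item for WORKITEM granularity. A toy sketch of that scaling, with made-up names and numbers (loopy performs the equivalent scaling internally when evaluating counts):

    WORKITEM, SUBGROUP, WORKGROUP = "workitem", "subgroup", "workgroup"

    def total_events(count, granularity, work_group_size, subgroup_size):
        # Scale a per-<granularity> count up to a per-work-group total.
        if granularity == WORKITEM:
            return count * work_group_size
        if granularity == SUBGROUP:
            return count * (work_group_size // subgroup_size)
        if granularity == WORKGROUP:
            return count
        raise ValueError(granularity)

    # 128 work-items in sub-groups of 32: a per-sub-group count of 10
    # contributes 10 * (128 // 32) = 40 events per work-group.
    print(total_events(10, SUBGROUP, work_group_size=128, subgroup_size=32))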
A sub-group is an @@ -750,7 +752,7 @@ class MemAccess(ImmutableRecord): from loopy.types import to_loopy_type dtype = to_loopy_type(dtype) - super(MemAccess, self).__init__(mtype=mtype, dtype=dtype, + super().__init__(mtype=mtype, dtype=dtype, lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tag=variable_tag, @@ -763,13 +765,13 @@ class MemAccess(ImmutableRecord): def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s, %s)" % ( + return "MemAccess({}, {}, {}, {}, {}, {}, {}, {}, {})".format( self.mtype, self.dtype, None if self.lid_strides is None else dict( - sorted(six.iteritems(self.lid_strides))), + sorted(self.lid_strides.items())), None if self.gid_strides is None else dict( - sorted(six.iteritems(self.gid_strides))), + sorted(self.gid_strides.items())), self.direction, self.variable, self.variable_tag, @@ -795,11 +797,11 @@ class Sync(ImmutableRecord): """ def __init__(self, kind=None, kernel_name=None): - super(Sync, self).__init__(kind=kind, kernel_name=kernel_name) + super().__init__(kind=kind, kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "Sync(%s, %s)" % (self.kind, self.kernel_name) + return f"Sync({self.kind}, {self.kernel_name})" # }}} @@ -844,12 +846,12 @@ class CounterBase(CombineMapper): if isinstance(clbl, CallableKernel): sub_result = self.kernel_rec(clbl.subkernel) - arg_dict = dict( - (arg.name, value) + arg_dict = { + arg.name: value for arg, value in zip( clbl.subkernel.args, expr.parameters) - if isinstance(arg, ValueArg)) + if isinstance(arg, ValueArg)} return subst_into_to_count_map( self.param_space, @@ -909,7 +911,7 @@ class CounterBase(CombineMapper): class ExpressionOpCounter(CounterBase): def __init__(self, knl, callables_table, kernel_rec, count_within_subscripts=True): - super(ExpressionOpCounter, self).__init__( + super().__init__( knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts @@ -933,12 +935,12 @@ class ExpressionOpCounter(CounterBase): if not isinstance(clbl, CallableKernel): return self.new_poly_map( {Op(dtype=self.type_inf(expr), - name='func:'+clbl.name, + name="func:"+clbl.name, count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one} ) + self.rec(expr.parameters) else: - return super(ExpressionOpCounter, self).map_call(expr) + return super().map_call(expr) def map_subscript(self, expr): if self.count_within_subscripts: @@ -954,7 +956,7 @@ class ExpressionOpCounter(CounterBase): assert expr.children return self.new_poly_map( {Op(dtype=self.type_inf(expr), - name='add', + name="add", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.zero + (len(expr.children)-1)} @@ -964,7 +966,7 @@ class ExpressionOpCounter(CounterBase): from pymbolic.primitives import is_zero assert expr.children return sum(self.new_poly_map({Op(dtype=self.type_inf(expr), - name='mul', + name="mul", count_granularity=( self.arithmetic_count_granularity), kernel_name=self.knl.name): self.one}) @@ -972,14 +974,14 @@ class ExpressionOpCounter(CounterBase): for child in expr.children if not is_zero(child + 1)) + \ self.new_poly_map({Op(dtype=self.type_inf(expr), - name='mul', + name="mul", count_granularity=( self.arithmetic_count_granularity), kernel_name=self.knl.name): -self.one}) def map_quotient(self, expr, *args): return 
self.new_poly_map({Op(dtype=self.type_inf(expr), - name='div', + name="div", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one}) \ + self.rec(expr.numerator) \ @@ -990,7 +992,7 @@ class ExpressionOpCounter(CounterBase): def map_power(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='pow', + name="pow", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one}) \ + self.rec(expr.base) \ @@ -998,7 +1000,7 @@ class ExpressionOpCounter(CounterBase): def map_left_shift(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='shift', + name="shift", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one}) \ + self.rec(expr.shiftee) \ @@ -1008,14 +1010,14 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='bw', + name="bw", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.one}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='bw', + name="bw", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): self.zero + (len(expr.children)-1)}) \ @@ -1040,7 +1042,7 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return self.new_poly_map({Op(dtype=self.type_inf(expr), - name='maxmin', + name="maxmin", count_granularity=self.arithmetic_count_granularity, kernel_name=self.knl.name): len(expr.children)-1}) \ @@ -1119,14 +1121,22 @@ def _get_lid_and_gid_strides(knl, array, index): def get_iname_strides(tag_to_iname_dict): tag_to_stride_dict = {} - for tag, iname in six.iteritems(tag_to_iname_dict): + + if array.dim_tags is None: + assert len(index) <= 1 + dim_tags = (None,) * len(index) + else: + dim_tags = array.dim_tags + + for tag, iname in tag_to_iname_dict.items(): total_iname_stride = 0 # find total stride of this iname for each axis - for idx, axis_tag in zip(index, array.dim_tags): + for idx, axis_tag in zip(index, dim_tags): # collect index coefficients try: - coeffs = _IndexStrideCoefficientCollector()( - simplify_using_aff(knl, idx)) + coeffs = _IndexStrideCoefficientCollector( + [tag_to_iname_dict[tag]])( + simplify_using_aff(knl, idx)) except ExpressionNotAffineError: total_iname_stride = None break @@ -1142,6 +1152,14 @@ def _get_lid_and_gid_strides(knl, array, index): # now determine stride if isinstance(axis_tag, FixedStrideArrayDimTag): axis_tag_stride = axis_tag.stride + + if axis_tag_stride is lp.auto: + total_iname_stride = None + break + + elif axis_tag is None: + axis_tag_stride = 1 + else: continue @@ -1172,7 +1190,7 @@ class MemAccessCounterBase(CounterBase): if not isinstance(clbl, CallableKernel): return self.rec(expr.parameters) else: - return super(MemAccessCounterBase, self).map_call(expr) + return super().map_call(expr) # }}} @@ -1191,7 +1209,7 @@ class LocalMemAccessCounter(MemAccessCounterBase): if index is None: # no subscript count_map[MemAccess( - mtype='local', + mtype="local", dtype=dtype, count_granularity=self.local_mem_count_granularity, kernel_name=self.knl.name)] = self.one @@ -1208,10 +1226,10 @@ class LocalMemAccessCounter(MemAccessCounterBase): self.knl, array, index_tuple) count_map[MemAccess( - mtype='local', + mtype="local", dtype=dtype, - lid_strides=dict(sorted(six.iteritems(lid_strides))), - gid_strides=dict(sorted(six.iteritems(gid_strides))), + 
lid_strides=dict(sorted(lid_strides.items())),
+                gid_strides=dict(sorted(gid_strides.items())),
                 variable=name,
                 count_granularity=self.local_mem_count_granularity,
                 kernel_name=self.knl.name)] = self.one
@@ -1249,7 +1267,7 @@ class GlobalMemAccessCounter(MemAccessCounterBase):
             # this array is not in global memory
             return self.new_zero_poly_map()

-        return self.new_poly_map({MemAccess(mtype='global',
+        return self.new_poly_map({MemAccess(mtype="global",
                                    dtype=self.type_inf(expr),
                                    lid_strides={},
                                    gid_strides={},
                                    variable=name,
                                    count_granularity=CountGranularity.WORKITEM,
@@ -1292,10 +1310,10 @@ class GlobalMemAccessCounter(MemAccessCounterBase):
                 ) else global_access_count_granularity

         return self.new_poly_map({MemAccess(
-            mtype='global',
+            mtype="global",
             dtype=self.type_inf(expr),
-            lid_strides=dict(sorted(six.iteritems(lid_strides))),
-            gid_strides=dict(sorted(six.iteritems(gid_strides))),
+            lid_strides=dict(sorted(lid_strides.items())),
+            gid_strides=dict(sorted(gid_strides.items())),
             variable=name,
             variable_tag=var_tag,
             count_granularity=count_granularity,
@@ -1321,7 +1339,7 @@ class AccessFootprintGatherer(CombineMapper):
         def merge_dicts(a, b):
             result = a.copy()

-            for var_name, footprint in six.iteritems(b):
+            for var_name, footprint in b.items():
                 if var_name in result:
                     result[var_name] = result[var_name] | footprint
                 else:
@@ -1641,7 +1659,7 @@ def _get_op_map_for_single_kernel(knl, callables_table,
     for insn in knl.instructions:
         if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
             ops = op_counter(insn.assignees) + op_counter(insn.expression)
-            for key, val in six.iteritems(ops.count_map):
+            for key, val in ops.count_map.items():
                 count = _get_insn_count(knl, callables_table, insn.id,
                         subgroup_size, count_redundant_work,
                         key.count_granularity)
@@ -1673,14 +1691,14 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
        count operations inside array indices.

    :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
-        ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
+        ``"guess"``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
         a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used,
         e.g., when counting a :class:`MemAccess` whose count_granularity
         specifies that it should only be counted once per sub-group. If set to
         *None* an attempt to find the sub-group size using the device will be
         made, if this fails an error will be raised. If a :class:`str`
-        ``'guess'`` is passed as the subgroup_size, :func:`get_op_map` will
+        ``"guess"`` is passed as the subgroup_size, :func:`get_op_map` will
         attempt to find the sub-group size using the device and, if
         unsuccessful, will make a wild guess.

@@ -1699,13 +1717,13 @@
         # (first create loopy kernel and specify array data types)

         op_map = get_op_map(knl)
-        params = {'n': 512, 'm': 256, 'l': 128}
+        params = {"n": 512, "m": 256, "l": 128}
         f32add = op_map[Op(np.float32,
-                           'add',
+                           "add",
                            count_granularity=CountGranularity.WORKITEM)
                        ].eval_with_dict(params)
         f32mul = op_map[Op(np.float32,
-                           'mul',
+                           "mul",
                            count_granularity=CountGranularity.WORKITEM)
                        ].eval_with_dict(params)

@@ -1746,7 +1764,7 @@ def _find_subgroup_size_for_knl(knl):
         subgroup_size_guess = get_simd_group_size(knl.target.device, None)
         warn_with_kernel(knl, "getting_subgroup_size_from_device",
                          "Device: %s. 
Using sub-group size given by " - "pyopencl.characterize.get_simd_group_size(): %d" + "pyopencl.characterize.get_simd_group_size(): %s" % (knl.target.device, subgroup_size_guess)) return subgroup_size_guess else: @@ -1764,7 +1782,7 @@ def _process_subgroup_size(knl, subgroup_size_requested): if subgroup_size_requested is None: if subgroup_size_guess is None: - # 'guess' was not passed and either no target device found + # "guess" was not passed and either no target device found # or get_simd_group_size returned None raise ValueError("No sub-group size passed, no target device found. " "Either (1) pass integer value for subgroup_size, " @@ -1774,7 +1792,7 @@ def _process_subgroup_size(knl, subgroup_size_requested): else: return subgroup_size_guess - elif subgroup_size_requested == 'guess': + elif subgroup_size_requested == "guess": if subgroup_size_guess is None: # unable to get subgroup_size from device, so guess subgroup_size_guess = 32 @@ -1831,7 +1849,7 @@ def _get_mem_access_map_for_single_kernel(knl, callables_table, + access_counter_l(insn.assignee) ).with_set_attributes(direction="store") - for key, val in six.iteritems(insn_access_map.count_map): + for key, val in insn_access_map.count_map.items(): count = _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, key.count_granularity) @@ -1860,14 +1878,14 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or + :arg subgroup_size: An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails - an error will be raised. If a :class:`str` ``'guess'`` is passed as + an error will be raised. If a :class:`str` ``"guess"`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. 
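The subgroup_size resolution just described (an integer, the string "guess", or *None*) reduces to a few cases. The sketch below restates the documented behavior in plain code; it is a simplification, not the actual `_process_subgroup_size` implementation, and `device_guess` stands in for the result of the device query:

    def resolve_subgroup_size(requested, device_guess):
        # requested: an int, the string "guess", or None.
        # device_guess: an int from the device query (e.g. what
        # pyopencl.characterize.get_simd_group_size() returned), or None.
        if isinstance(requested, int):
            return requested
        if requested is None:
            if device_guess is None:
                raise ValueError("no sub-group size passed "
                        "and no target device found")
            return device_guess
        if requested == "guess":
            # fall back to a wild guess when the device query failed
            return device_guess if device_guess is not None else 32
        raise ValueError(f"invalid subgroup_size: {requested!r}")

    print(resolve_subgroup_size("guess", None))   # 32 (the wild guess)
    print(resolve_subgroup_size(None, 64))        # 64 (from the device)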
@@ -1886,43 +1904,43 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, # (first create loopy kernel and specify array data types) - params = {'n': 512, 'm': 256, 'l': 128} + params = {"n": 512, "m": 256, "l": 128} mem_map = get_mem_access_map(knl) f32_s1_g_ld_a = mem_map[MemAccess( - mtype='global', + mtype="global", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='load', - variable='a', + direction="load", + variable="a", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_g_st_a = mem_map[MemAccess( - mtype='global', + mtype="global", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='store', - variable='a', + direction="store", + variable="a", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_l_ld_x = mem_map[MemAccess( - mtype='local', + mtype="local", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='load', - variable='x', + direction="load", + variable="x", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) f32_s1_l_st_x = mem_map[MemAccess( - mtype='local', + mtype="local", dtype=np.float32, lid_strides={0: 1}, gid_strides={0: 256}, - direction='store', - variable='x', + direction="store", + variable="x", count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) @@ -1954,10 +1972,6 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, def _get_synchronization_map_for_single_kernel(knl, callables_table, subgroup_size=None): - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - knl = lp.get_one_scheduled_kernel(knl, callables_table) from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, @@ -2011,14 +2025,14 @@ def get_synchronization_map(program, subgroup_size=None): :arg program: A :class:`loopy.LoopKernel` whose barriers are to be counted. :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` - ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None*, an attempt to find the sub-group size using the device will be made; if this fails, an error will be raised. If a :class:`str` - ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + ``"guess"`` is passed as the subgroup_size, get_synchronization_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. 
@@ -2034,8 +2048,8 @@ def get_synchronization_map(program, subgroup_size=None): # (first create loopy kernel and specify array data types) sync_map = get_synchronization_map(knl) - params = {'n': 512, 'm': 256, 'l': 128} - barrier_ct = sync_map['barrier_local'].eval_with_dict(params) + params = {"n": 512, "m": 256, "l": 128} + barrier_ct = sync_map["barrier_local"].eval_with_dict(params) # (now use this count to, e.g., predict performance) @@ -2119,10 +2133,10 @@ def gather_access_footprints(program, ignore_uncountable=False): result = {} - for vname, footprint in six.iteritems(write_footprints): + for vname, footprint in write_footprints.items(): result[(vname, "write")] = footprint - for vname, footprint in six.iteritems(read_footprints): + for vname, footprint in read_footprints.items(): result[(vname, "read")] = footprint return result @@ -2166,5 +2180,4 @@ def gather_access_footprint_bytes(program, ignore_uncountable=False): # }}} - # vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b8341bcd15b76eeb1705b8ca26d69237a6b97c0d..165b8ea4415547f5557e73ef987c4348493376d9 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1,6 +1,5 @@ """Pymbolic mappers for loopy.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,8 +24,8 @@ THE SOFTWARE. """ -import six -from six.moves import range, zip, reduce, intern +from functools import reduce +from sys import intern from pytools import memoize, memoize_method, ImmutableRecord import pytools.lex @@ -65,10 +64,36 @@ from islpy import dim_type import re import numpy as np +__doc__ = """ +.. currentmodule:: loopy.symbolic + +.. autoclass:: Literal + +.. autoclass:: ArrayLiteral + +.. autoclass:: FunctionIdentifier + +.. autoclass:: TypedCSE + +.. autoclass:: TypeCast + +.. autoclass:: TaggedVariable + +.. autoclass:: Reduction + +.. autoclass:: LinearSubscript + +.. autoclass:: RuleArgument + +.. autoclass:: ExpansionState + +.. 
autoclass:: RuleAwareIdentityMapper +""" + # {{{ mappers with support for loopy-specific primitives -class IdentityMapperMixin(object): +class IdentityMapperMixin: def map_literal(self, expr, *args, **kwargs): return expr @@ -232,13 +257,13 @@ class StringifyMapper(StringifyMapperBase): def map_reduction(self, expr, prec): from pymbolic.mapper.stringifier import PREC_NONE - return "%sreduce(%s, [%s], %s)" % ( + return "{}reduce({}, [{}], {})".format( "simul_" if expr.allow_simultaneous else "", expr.operation, ", ".join(expr.inames), self.rec(expr.expr, PREC_NONE)) def map_tagged_variable(self, expr, prec): - return "%s$%s" % (expr.name, expr.tag) + return f"{expr.name}${expr.tag}" def map_linear_subscript(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE @@ -249,7 +274,7 @@ class StringifyMapper(StringifyMapperBase): enclosing_prec, PREC_CALL) def map_loopy_function_identifier(self, expr, enclosing_prec): - return "%s<%s>" % ( + return "{}<{}>".format( type(expr).__name__, ", ".join(str(a) for a in expr.__getinitargs__())) @@ -258,14 +283,15 @@ class StringifyMapper(StringifyMapperBase): def map_type_cast(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + return "cast({}, {})".format( + repr(expr.type), self.rec(expr.child, PREC_NONE)) def map_resolved_function(self, expr, prec): return expr.name def map_sub_array_ref(self, expr, prec): return "[{inames}]: {subscr}".format( - inames=','.join(self.rec(iname, prec) for iname in + inames=",".join(self.rec(iname, prec) for iname in expr.swept_inames), subscr=self.rec(expr.subscript, prec)) @@ -282,7 +308,7 @@ class EqualityPreservingStringifyMapper(StringifyMapperBase): """ def __init__(self): - super(EqualityPreservingStringifyMapper, self).__init__() + super().__init__() def map_constant(self, expr, enclosing_prec): if isinstance(expr, np.generic): @@ -290,7 +316,7 @@ class EqualityPreservingStringifyMapper(StringifyMapperBase): # FIXME: This syntax cannot currently be parsed. 
- return "%s(%s)" % (type(expr).__name__, repr(expr)) + return "{}({})".format(type(expr).__name__, repr(expr)) else: result = repr(expr) @@ -308,8 +334,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.function) != type(other.function) # noqa - ): + or type(expr.function) != type(other.function)): # noqa return [] return self.rec(expr.expr, other.expr, unis) @@ -353,11 +378,10 @@ class DependencyMapper(DependencyMapperBase): def map_reduction(self, expr, *args, **kwargs): deps = self.rec(expr.expr, *args, **kwargs) - - return deps - set(p.Variable(iname) for iname in expr.inames) + return deps - {p.Variable(iname) for iname in expr.inames} def map_tagged_variable(self, expr, *args, **kwargs): - return set([expr]) + return {expr} def map_loopy_function_identifier(self, expr, *args, **kwargs): return set() @@ -374,6 +398,9 @@ class DependencyMapper(DependencyMapperBase): def map_resolved_function(self, expr): return self.rec(expr.function) + def map_literal(self, expr): + return set() + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -383,7 +410,7 @@ class SubstitutionRuleExpander(IdentityMapper): if expr.name in self.rules: return self.map_substitution(expr.name, self.rules[expr.name], ()) else: - return super(SubstitutionRuleExpander, self).map_variable(expr) + return super().map_variable(expr) def map_call(self, expr): if expr.function.name in self.rules: @@ -392,7 +419,7 @@ class SubstitutionRuleExpander(IdentityMapper): self.rules[expr.function.name], expr.parameters) else: - return super(SubstitutionRuleExpander, self).map_call(expr) + return super().map_call(expr) def map_substitution(self, name, rule, arguments): if len(rule.arguments) != len(arguments): @@ -429,7 +456,7 @@ class Literal(LoopyExpressionBase): .. note:: Only used in the output of - :mod:`loopy.target.c.expression.ExpressionToCExpressionMapper` (and + :mod:`loopy.target.c.codegen.expression.ExpressionToCExpressionMapper` (and similar mappers). Not for use in Loopy source representation. """ @@ -450,7 +477,7 @@ class ArrayLiteral(LoopyExpressionBase): .. note:: Only used in the output of - :mod:`loopy.target.c.expression.ExpressionToCExpressionMapper` (and + :mod:`loopy.target.c.codegen.expression.ExpressionToCExpressionMapper` (and similar mappers). Not for use in Loopy source representation. 
""" @@ -511,7 +538,7 @@ class TypedCSE(LoopyExpressionBase, p.CommonSubexpression): """ def __init__(self, child, prefix=None, dtype=None): - super(TypedCSE, self).__init__(child, prefix) + super().__init__(child, prefix) self.dtype = dtype def __getinitargs__(self): @@ -527,7 +554,7 @@ class TypeAnnotation(LoopyExpressionBase): """ def __init__(self, type, child): - super(TypeAnnotation, self).__init__() + super().__init__() self.type = type self.child = child @@ -547,7 +574,7 @@ class TypeCast(LoopyExpressionBase): """ def __init__(self, type, child): - super(TypeCast, self).__init__() + super().__init__() from loopy.types import to_loopy_type, NumpyType type = to_loopy_type(type) @@ -587,7 +614,7 @@ class TaggedVariable(LoopyExpressionBase, p.Variable): init_arg_names = ("name", "tag") def __init__(self, name, tag): - super(TaggedVariable, self).__init__(name) + super().__init__(name) self.tag = tag def __getinitargs__(self): @@ -597,8 +624,8 @@ class TaggedVariable(LoopyExpressionBase, p.Variable): class Reduction(LoopyExpressionBase): - """Represents a reduction operation on :attr:`exprs` - across :attr:`inames`. + """ + Represents a reduction operation on :attr:`expr` across :attr:`inames`. .. attribute:: operation an instance of :class:`loopy.library.reduction.ReductionOperation` @@ -612,9 +639,9 @@ class Reduction(LoopyExpressionBase): An expression which may have tuple type. If the expression has tuple type, it must be one of the following: - * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or - * a :class:`loopy.symbolic.Reduction`, or - * a function call or substitution rule invocation. + * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or + * a :class:`loopy.symbolic.Reduction`, or + * a function call or substitution rule invocation. .. 
attribute:: allow_simultaneous @@ -813,7 +840,7 @@ class SweptInameStrideCollector(CoefficientCollectorBase): or expr.aggregate.name not in self.target_names): return {1: expr} - return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) + return super().map_algebraic_leaf(expr) def get_start_subscript_from_sar(sar, kernel): @@ -981,11 +1008,11 @@ def rename_subst_rules_in_instructions(insns, renames): for insn in insns] -class SubstitutionRuleMappingContext(object): +class SubstitutionRuleMappingContext: def _get_subst_rule_key(self, args, body): - subst_dict = dict( - (arg, RuleArgument(i)) - for i, arg in enumerate(args)) + subst_dict = { + arg: RuleArgument(i) + for i, arg in enumerate(args)} from pymbolic.mapper.substitutor import make_subst_func arg_subst_map = SubstitutionMapper(make_subst_func(subst_dict)) @@ -997,10 +1024,10 @@ class SubstitutionRuleMappingContext(object): self.make_unique_var_name = make_unique_var_name # maps subst rule (args, bodies) to (names, original_name) - self.subst_rule_registry = dict( - (self._get_subst_rule_key(rule.arguments, rule.expression), - (name, rule.arguments, rule.expression)) - for name, rule in six.iteritems(old_subst_rules)) + self.subst_rule_registry = { + self._get_subst_rule_key(rule.arguments, rule.expression): + (name, rule.arguments, rule.expression) + for name, rule in old_subst_rules.items()} # maps subst rule (args, bodies) to a list of old names, # which doubles as (a) a histogram of uses and (b) a way @@ -1049,8 +1076,7 @@ class SubstitutionRuleMappingContext(object): used_names = set() - for key, (name, args, body) in six.iteritems( - self.subst_rule_registry): + for key, (name, args, body) in self.subst_rule_registry.items(): orig_names = self.subst_rule_old_names.get(key, []) # If no orig_names are found, then this particular @@ -1077,7 +1103,7 @@ class SubstitutionRuleMappingContext(object): subst_renamer = SubstitutionRuleRenamer(renames) renamed_result = {} - for name, rule in six.iteritems(result): + for name, rule in result.items(): renamed_result[name] = rule.copy( expression=subst_renamer(rule.expression)) @@ -1122,7 +1148,7 @@ class RuleAwareIdentityMapper(IdentityMapper): name, tag = parse_tagged_name(expr.function) if name not in self.rule_mapping_context.old_subst_rules: - return super(RuleAwareIdentityMapper, self).map_call(expr, expn_state) + return super().map_call(expr, expn_state) else: return self.map_substitution(name, tag, self.rec( expr.parameters, expn_state), expn_state) @@ -1135,9 +1161,9 @@ class RuleAwareIdentityMapper(IdentityMapper): from pymbolic.mapper.substitutor import make_subst_func arg_subst_map = SubstitutionMapper(make_subst_func(arg_context)) - return dict( - (formal_arg_name, arg_subst_map(arg_value)) - for formal_arg_name, arg_value in zip(arg_names, arguments)) + return { + formal_arg_name: arg_subst_map(arg_value) + for formal_arg_name, arg_value in zip(arg_names, arguments)} def map_substitution(self, name, tag, arguments, expn_state): rule = self.rule_mapping_context.old_subst_rules[name] @@ -1189,7 +1215,8 @@ class RuleAwareIdentityMapper(IdentityMapper): # may perform tasks entirely unrelated to subst rules, so # we must map assignees, too. 
self.map_instruction(kernel, - insn.with_transformed_expressions(self, kernel, insn)) + insn.with_transformed_expressions( + lambda expr: self(expr, kernel, insn))) for insn in kernel.instructions] return kernel.copy(instructions=new_insns) @@ -1197,7 +1224,7 @@ class RuleAwareIdentityMapper(IdentityMapper): class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, subst_func, within): - super(RuleAwareSubstitutionMapper, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.subst_func = subst_func self.within = within @@ -1206,20 +1233,20 @@ class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper): if (expr.name in expn_state.arg_context or not self.within( expn_state.kernel, expn_state.instruction, expn_state.stack)): - return super(RuleAwareSubstitutionMapper, self).map_variable( + return super().map_variable( expr, expn_state) result = self.subst_func(expr) if result is not None: return result else: - return super(RuleAwareSubstitutionMapper, self).map_variable( + return super().map_variable( expr, expn_state) class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, rules, within): - super(RuleAwareSubstitutionRuleExpander, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.rules = rules self.within = within @@ -1252,7 +1279,7 @@ class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper): else: # do not expand - return super(RuleAwareSubstitutionRuleExpander, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) # }}} @@ -1433,7 +1460,7 @@ class LoopyParser(ParserBase): return SubArrayRef(swept_inames, subscript) else: - return super(LoopyParser, self).parse_prefix(pstate) + return super().parse_prefix(pstate) def parse_postfix(self, pstate, min_precedence, left_exp): from pymbolic.parser import _PREC_CALL, _closebracket @@ -1493,7 +1520,7 @@ class ArrayAccessFinder(CombineMapper): if self.tgt_vector_name is None \ or expr.aggregate.name == self.tgt_vector_name: - return set([expr]) | self.rec(expr.index) + return {expr} | self.rec(expr.index) else: return CombineMapper.map_subscript(self, expr) @@ -1571,7 +1598,7 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): self.zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space)) context = {} - for name, (dt, pos) in six.iteritems(space.get_var_dict()): + for name, (dt, pos) in space.get_var_dict().items(): if dt == dim_type.set: dt = dim_type.in_ @@ -1583,7 +1610,7 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): self.pw_zero = isl.PwAff.from_aff(self.zero) - super(PwAffEvaluationMapper, self).__init__(context) + super().__init__(context) def map_constant(self, expr): if isinstance(expr, np.integer): @@ -1624,6 +1651,10 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): return num.mod_val(denom) + def map_literal(self, expr): + raise TypeError("literal '%s' not supported " + "for as-pwaff evaluation" % expr) + def aff_from_expr(space, expr, vars_to_zero=None): if vars_to_zero is None: @@ -1682,6 +1713,56 @@ def guarded_pwaff_from_expr(space, expr, vars_to_zero=None): # }}} +# {{{ (pw_)?qpoly_from_expr + +class PwQPolyEvaluationMapper(EvaluationMapperBase): + def __init__(self, space, vars_to_zero): + zero_qpoly = isl.QPolynomial.zero_on_domain(space) + + context = {} + for name, (dt, pos) in space.get_var_dict().items(): + if dt == dim_type.set: + dt = 
dim_type.in_ + + context[name] = isl.PwQPolynomial.from_qpolynomial( + isl.QPolynomial.var_on_domain(space, dt, pos)) + + for v in vars_to_zero: + context[v] = zero_qpoly + + self.pw_zero = isl.PwQPolynomial.from_qpolynomial(zero_qpoly) + + super().__init__(context) + + def map_constant(self, expr): + if isinstance(expr, np.integer): + expr = int(expr) + + return self.pw_zero + expr + + def map_quotient(self, expr): + raise TypeError("true division in '%s' not supported " + "for as-pwqpoly evaluation" % expr) + + +def pw_qpolynomial_from_expr(space, expr, vars_to_zero=frozenset()): + return PwQPolyEvaluationMapper(space, vars_to_zero)(expr) + + +def qpolynomial_from_expr(space, expr): + pw_qpoly = pw_qpolynomial_from_expr(space, expr).coalesce() + + pieces = pw_qpoly.get_pieces() + if len(pieces) == 1: + (s, qpoly), = pieces + return qpoly + else: + raise RuntimeError("expression '%s' could not be converted to a " + "non-piecewise quasi-polynomial expression" % expr) + +# }}} + + # {{{ simplify using aff # FIXME: redundant with simplify_via_aff @@ -1716,9 +1797,8 @@ def _term_to_expr(space, term): result = result*Variable(space.get_dim_name(dt, i))**exp for i in range(term.dim(dim_type.div)): - raise NotImplementedError("divs in terms") - # FIXME print the qpoly, match the semantics - result += aff_to_expr(term.get_div(i)) + exp = term.get_exp(dim_type.div, i) + result *= (aff_to_expr(term.get_div(i))**exp) return result @@ -1751,6 +1831,101 @@ def constraint_to_cond_expr(cns): # }}} + + +# {{{ isl_set_from_expr + +class ConditionExpressionToBooleanOpsExpression(IdentityMapper): + """ + Mapper to convert expressions into a composition of boolean operation nodes + according to C-semantics. + + For example: + - ``i`` becomes ``i != 0`` + - ``i>10 and j`` becomes ``i>10 and j!=0`` + """ + + def map_comparison(self, expr): + return expr + + def _get_expr_neq_0(self, expr): + return p.Comparison(expr, "!=", 0) + + map_variable = _get_expr_neq_0 + map_subscript = _get_expr_neq_0 + map_sum = _get_expr_neq_0 + map_product = _get_expr_neq_0 + map_constant = _get_expr_neq_0 + map_call = _get_expr_neq_0 + map_power = _get_expr_neq_0 + + +class AffineConditionToISLSetMapper(IdentityMapper): + """ + Mapper to convert a condition :class:`~pymbolic.primitives.Expression` to a + :class:`~islpy.Set`. + """ + + def __init__(self, space): + self.space = space + super().__init__() + + def map_comparison(self, expr): + if expr.operator == "!=": + return self.rec(p.LogicalNot(p.Comparison(expr.left, "==", expr.right))) + + left_aff = guarded_aff_from_expr(self.space, expr.left) + right_aff = guarded_aff_from_expr(self.space, expr.right) + + if expr.operator == "==": + cnst = isl.Constraint.equality_from_aff(left_aff-right_aff) + elif expr.operator == ">=": + cnst = isl.Constraint.inequality_from_aff(left_aff-right_aff) + elif expr.operator == ">": + cnst = isl.Constraint.inequality_from_aff(left_aff-right_aff-1) + elif expr.operator == "<=": + cnst = isl.Constraint.inequality_from_aff(right_aff-left_aff) + elif expr.operator == "<": + cnst = isl.Constraint.inequality_from_aff(right_aff-left_aff-1) + else: + assert False + + return isl.Set.universe(self.space).add_constraint(cnst) + + def _map_logical_reduce(self, expr, f): + """ + :arg f: Reduction callable. 
+ """ + sets = [self.rec(child) for child in expr.children] + return reduce(f, sets) + + def map_logical_or(self, expr): + import operator + return self._map_logical_reduce(expr, operator.or_) + + def map_logical_and(self, expr): + import operator + return self._map_logical_reduce(expr, operator.and_) + + def map_logical_not(self, expr): + set_ = self.rec(expr.child) + return set_.complement() + + +def isl_set_from_expr(space, expr): + """ + :arg expr: An instance of :class:`pymbolic.primitives.Expression` whose + boolean value is evaluated according to C-semantics. + """ + mapper = AffineConditionToISLSetMapper(space) + expr = ConditionExpressionToBooleanOpsExpression()(expr) + set_ = mapper(expr) + assert isinstance(set_, isl.Set) + + return set_ + +# }}} + + # {{{ set_to_cond_expr def basic_set_to_cond_expr(isl_basicset): @@ -1881,9 +2056,11 @@ class UnableToDetermineAccessRange(Exception): pass -def get_access_range(domain, subscript, assumptions, shape=None, +def get_access_range(domain, subscript, assumptions=None, shape=None, allowed_constant_names=None): """ + :arg assumptions: An instance of :class:`islpy.BasicSet` or *None*. *None* + is equivalent to the universal set over *domain*'s space. :arg shape: if not *None*, indicates that it is desired to return an overestimate of the access range based on the shape if a precise range cannot be determined. @@ -1891,10 +2068,11 @@ def get_access_range(domain, subscript, assumptions, shape=None, permitted in the access range expressions. Names that are already parameters of *domain* may be repeated without ill effects. """ - domain, assumptions = isl.align_two(domain, - assumptions) - domain = domain & assumptions - del assumptions + if assumptions is not None: + domain, assumptions = isl.align_two(domain, + assumptions) + domain = domain & assumptions + del assumptions dims = len(subscript) @@ -1908,9 +2086,9 @@ def get_access_range(domain, subscript, assumptions, shape=None, access_map = isl.Set.from_basic_set(access_map) if allowed_constant_names is not None: - allowed_constant_names = set(allowed_constant_names) - set( + allowed_constant_names = set(allowed_constant_names) - { access_map.get_dim_name(dim_type.param, i) - for i in range(access_map.dim(dim_type.param))) + for i in range(access_map.dim(dim_type.param))} par_base = access_map.dim(dim_type.param) access_map = access_map.insert_dims(dim_type.param, par_base, @@ -2045,11 +2223,11 @@ class BatchedAccessRangeMapper(WalkMapper): return self.rec(expr.child, inames) def map_sub_array_ref(self, expr, inames): - total_inames = inames | set([iname.name for iname in expr.swept_inames]) + total_inames = inames | {iname.name for iname in expr.swept_inames} return self.rec(expr.subscript, total_inames) -class AccessRangeMapper(object): +class AccessRangeMapper: """**IMPORTANT** Using this class *will likely* lead to performance bottlenecks. 
@@ -2084,7 +2262,7 @@ class AccessRangeMapper(object): # {{{ check if access ranges overlap -class AccessRangeOverlapChecker(object): +class AccessRangeOverlapChecker: """Used for checking for overlap between access ranges of instructions.""" def __init__(self, kernel): @@ -2108,7 +2286,7 @@ class AccessRangeOverlapChecker(object): for expr in exprs: arm(expr, self.kernel.insn_inames(insn)) - for name, arange in six.iteritems(arm.access_ranges): + for name, arange in arm.access_ranges.items(): if arm.bad_subscripts[name]: aranges[name] = True continue diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 20220d41838783389da955f78773ca18b67d04c6..a05bc66a22e8a96919c2dd0af5cc1e4c1166e710 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -1,6 +1,5 @@ """Base target interface.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -30,7 +29,6 @@ __doc__ = """ .. autoclass:: TargetBase .. autoclass:: ASTBuilderBase - .. autoclass:: CFamilyTarget .. autoclass:: CTarget .. autoclass:: ExecutableCTarget @@ -44,7 +42,7 @@ """ -class TargetBase(object): +class TargetBase: """Base class for all targets, i.e. different combinations of code that loopy can generate. @@ -142,7 +140,7 @@ class TargetBase(object): raise NotImplementedError() -class ASTBuilderBase(object): +class ASTBuilderBase: """An interface for generating (host or device) ASTs. """ @@ -249,14 +247,14 @@ class ASTBuilderBase(object): # {{{ dummy host ast builder -class _DummyExpressionToCodeMapper(object): +class _DummyExpressionToCodeMapper: def rec(self, expr, prec, type_context=None, needed_dtype=None): return "" __call__ = rec -class _DummyASTBlock(object): +class _DummyASTBlock: def __init__(self, arg): self.contents = [] diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index c8aa041da632b7d6896376761117b416dd56eade..37997d7abeb6b22a304f6160af28a214eaf4c50d 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1,6 +1,5 @@ """Plain C target and base for other C-family languages.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -24,8 +23,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six import numpy as np # noqa from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError @@ -39,10 +36,20 @@ import pymbolic.primitives as p from pytools import memoize_method +__doc__ = """ +.. currentmodule:: loopy.target.c + +.. autoclass:: POD + +.. autoclass:: ScopingBlock + +.. 
automodule:: loopy.target.c.codegen.expression +""" + # {{{ dtype registry wrapper -class DTypeRegistryWrapper(object): +class DTypeRegistryWrapper: def __init__(self, wrapped_registry): self.wrapped_registry = wrapped_registry @@ -82,6 +89,11 @@ class DTypeRegistryWrapper(object): def c99_preamble_generator(preamble_info): if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes): yield("10_stdint", "#include <stdint.h>") + if any(dtype.numpy_dtype == np.dtype("bool") + for dtype in preamble_info.seen_dtypes): + yield("10_stdbool", "#include <stdbool.h>") + if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes): + yield("10_complex", "#include <complex.h>") def _preamble_generator(preamble_info): @@ -153,9 +165,9 @@ """, } - c_funcs = set(func.c_name for func in preamble_info.seen_functions) + c_funcs = {func.c_name for func in preamble_info.seen_functions} - for func_name, func_body in six.iteritems(function_defs): + for func_name, func_body in function_defs.items(): if any((func_name + "_" + tpname) in c_funcs for tpname in integer_type_names): yield def_integer_types_macro @@ -204,7 +216,7 @@ class POD(Declarator): class ScopingBlock(Block): """A block that is mandatory for scoping and may not be simplified away - by :func:`loopy.codegen.results.merge_codegen_results`. + by :func:`loopy.codegen.result.merge_codegen_results`. """ @@ -248,8 +260,7 @@ def generate_linearized_array(array, value): assert array.offset == 0 - from pytools import indices_in_shape - for ituple in indices_in_shape(value.shape): + for ituple in np.ndindex(value.shape): i = sum(i_ax * strd_ax for i_ax, strd_ax in zip(ituple, strides)) data[i] = value[ituple] @@ -310,7 +321,7 @@ class ASTSubscriptCollector(CASTIdentityMapper): # {{{ lazy expression generation -class CExpression(object): +class CExpression: def __init__(self, to_code_mapper, expr): self.to_code_mapper = to_code_mapper self.expr = expr @@ -332,7 +343,7 @@ class CFamilyTarget(TargetBase): def __init__(self, fortran_abi=False): self.fortran_abi = fortran_abi - super(CFamilyTarget, self).__init__() + super().__init__() def split_kernel_at_global_barriers(self): return False @@ -425,7 +436,7 @@ class CMathCallable(ScalarCallable): for id in arg_id_to_dtype: if not -1 <= id <= 0: - raise LoopyError("%s can take only one argument." 
% name) + raise LoopyError(f"'{name}' can take only one argument.") if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the @@ -437,11 +448,11 @@ class CMathCallable(ScalarCallable): dtype = arg_id_to_dtype[0] dtype = dtype.numpy_dtype - if dtype.kind in ('u', 'i'): + if dtype.kind in ("u", "i"): # ints and unsigned casted to float32 dtype = np.float32 - elif dtype.kind == 'c': - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + elif dtype.kind == "c": + raise LoopyTypeError(f"{name} does not support type {dtype}") from loopy.target.opencl import OpenCLTarget if not isinstance(caller_kernel.target, OpenCLTarget): @@ -453,7 +464,7 @@ class CMathCallable(ScalarCallable): elif dtype == np.float128: # pylint:disable=no-member name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" % (name, + raise LoopyTypeError("{} does not support type {}".format(name, dtype)) return ( @@ -530,20 +541,19 @@ class CFamilyASTBuilder(ASTBuilderBase): def symbol_manglers(self): return ( - super(CFamilyASTBuilder, self).symbol_manglers() + [ + super().symbol_manglers() + [ c_symbol_mangler ]) def preamble_generators(self): return ( - super(CFamilyASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ _preamble_generator, ]) def function_id_in_knl_callable_mapper(self): return ( - super(CFamilyASTBuilder, - self).function_id_in_knl_callable_mapper() + [ + super().function_id_in_knl_callable_mapper() + [ scope_c_math_functions]) # }}} @@ -577,7 +587,7 @@ class CFamilyASTBuilder(ASTBuilderBase): break if is_first_dev_prog: for tv in sorted( - six.itervalues(kernel.temporary_variables), + kernel.temporary_variables.values(), key=lambda tv: tv.name): if tv.address_space == AddressSpace.GLOBAL and ( @@ -671,7 +681,7 @@ class CFamilyASTBuilder(ASTBuilderBase): | temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( - six.itervalues(kernel.temporary_variables), + kernel.temporary_variables.values(), key=lambda tv: tv.name): decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype) @@ -734,7 +744,7 @@ class CFamilyASTBuilder(ASTBuilderBase): cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( temp_var_decl, - "(%s %s) (%s + %s)" % ( + "({} {}) ({} + {})".format( " ".join(cast_tp), cast_d, tv.base_storage, offset)) @@ -748,7 +758,7 @@ class CFamilyASTBuilder(ASTBuilderBase): ecm = self.get_expression_to_code_mapper(codegen_state) - for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): + for bs_name, bs_sizes in sorted(base_storage_sizes.items()): bs_var_decl = Value("char", bs_name) from pytools import single_valued bs_var_decl = self.wrap_temporary_decl( @@ -957,7 +967,7 @@ class CFamilyASTBuilder(ASTBuilderBase): in_knl_callable = codegen_state.callables_table[func_id] if isinstance(in_knl_callable, ScalarCallable) and ( - in_knl_callable.name_in_target == 'loopy_make_tuple'): + in_knl_callable.name_in_target == "loopy_make_tuple"): return self.emit_tuple_assignment(codegen_state, insn) # takes "is_returned" to infer whether insn.assignees[0] is a part of @@ -1046,7 +1056,7 @@ class CFunctionDeclExtractor(CASTIdentityMapper): def map_function_decl_wrapper(self, node): self.decls.append(node.subdecl) - return super(CFunctionDeclExtractor, self)\ + return super()\ .map_function_decl_wrapper(node) @@ -1054,7 +1064,7 @@ def generate_header(kernel, codegen_result=None): """ :arg kernel: a :class:`loopy.LoopKernel` :arg 
codegen_result: an instance of :class:`loopy.CodeGenerationResult` - :returns: a list of AST nodes (which may have :func:`str` + :returns: a list of AST nodes (which may have :class:`str` called on them to produce a string) representing function declarations for the generated device functions. @@ -1062,7 +1072,7 @@ if not isinstance(kernel.target, CFamilyTarget): raise LoopyError( - 'Header generation for non C-based languages are not implemented') + "Header generation for non C-based languages are not implemented") if codegen_result is None: from loopy.codegen import generate_code_v2 @@ -1091,16 +1101,18 @@ class CTarget(CFamilyTarget): @memoize_method def get_dtype_registry(self): from loopy.target.c.compyte.dtypes import ( - DTypeRegistry, fill_registry_with_c99_stdint_types) + DTypeRegistry, fill_registry_with_c99_stdint_types, + fill_registry_with_c99_complex_types) result = DTypeRegistry() fill_registry_with_c99_stdint_types(result) + fill_registry_with_c99_complex_types(result) return DTypeRegistryWrapper(result) class CASTBuilder(CFamilyASTBuilder): def preamble_generators(self): return ( - super(CASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ c99_preamble_generator, ]) @@ -1115,7 +1127,7 @@ class ExecutableCTarget(CTarget): """ def __init__(self, compiler=None, fortran_abi=False): - super(ExecutableCTarget, self).__init__(fortran_abi=fortran_abi) + super().__init__(fortran_abi=fortran_abi) from loopy.target.c.c_execution import CCompiler self.compiler = compiler or CCompiler() diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index b6525b5d183c803955317a92ed10d8206eecba65..2031b3703f1e431c2b3a1979282af85e4b167edd 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement, absolute_import - __copyright__ = "Copyright (C) 2017 Nick Curtis" __license__ = """ @@ -32,7 +30,6 @@ from pytools.py_codegen import (Indentation) from pytools.prefork import ExecError from codepy.toolchain import guess_toolchain, ToolchainGuessError, GCCToolchain from codepy.jit import compile_from_string -import six import ctypes import numpy as np @@ -49,12 +46,12 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def __init__(self): system_args = ["_lpy_c_kernels"] - super(CExecutionWrapperGenerator, self).__init__(system_args) + super().__init__(system_args) def python_dtype_str(self, dtype): if np.dtype(str(dtype)).isbuiltin: return "_lpy_np."+dtype.name - raise Exception('dtype: {0} not recognized'.format(dtype)) + raise Exception(f"dtype: {dtype} not recognized") # {{{ handle non numpy arguments @@ -110,7 +107,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): var("_lpy_expected_strides_%s" % i) for i in range(num_axes)) - gen("%s = %s.strides" % (strify(expected_strides), arg.name)) + gen("{} = {}.strides".format(strify(expected_strides), arg.name)) # check strides if not skip_arg_checks: @@ -149,7 +146,7 @@ kernel, implemented_data_info): gen("for knl in _lpy_c_kernels:") with Indentation(gen): - gen('knl({args})'.format( + gen("knl({args})".format( args=", ".join(args))) # }}} @@ -163,7 +160,7 @@ if options.return_dict: gen("return None, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) + % ", ".join(f'"{arg.name}": {arg.name}' for arg in 
implemented_data_info if issubclass(arg.arg_class, KernelArgument) if arg.base_name in @@ -191,7 +188,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): return arg.name -class CCompiler(object): +class CCompiler: """ The compiler module handles invocation of compilers to generate a shared lib using codepy, which can subsequently be loaded via ctypes. @@ -212,10 +209,10 @@ class CCompiler(object): """ def __init__(self, toolchain=None, - cc='gcc', cflags='-std=c99 -O3 -fPIC'.split(), - ldflags='-shared'.split(), libraries=[], + cc="gcc", cflags="-std=c99 -O3 -fPIC".split(), + ldflags="-shared".split(), libraries=[], include_dirs=[], library_dirs=[], defines=[], - source_suffix='c'): + source_suffix="c"): # try to get a default toolchain # or subclass supplied version if available self.toolchain = toolchain @@ -226,36 +223,36 @@ class CCompiler(object): # missing compiler python was built with (likely, Conda) # use a default GCCToolchain logger = logging.getLogger(__name__) - logger.warn('Default toolchain guessed from python config ' - 'not found, replacing with default GCCToolchain.') + logger.warn("Default toolchain guessed from python config " + "not found, replacing with default GCCToolchain.") # this is ugly, but I'm not sure there's a clean way to copy the # default args self.toolchain = GCCToolchain( - cc='gcc', - cflags='-std=c99 -O3 -fPIC'.split(), - ldflags='-shared'.split(), + cc="gcc", + cflags="-std=c99 -O3 -fPIC".split(), + ldflags="-shared".split(), libraries=[], library_dirs=[], defines=[], undefines=[], - source_suffix='c', - so_ext='.so', - o_ext='.o', + source_suffix="c", + so_ext=".so", + o_ext=".o", include_dirs=[]) if toolchain is None: # copy in all differing values - diff = {'cc': cc, - 'cflags': cflags, - 'ldflags': ldflags, - 'libraries': libraries, - 'include_dirs': include_dirs, - 'library_dirs': library_dirs, - 'defines': defines} + diff = {"cc": cc, + "cflags": cflags, + "ldflags": ldflags, + "libraries": libraries, + "include_dirs": include_dirs, + "library_dirs": library_dirs, + "defines": defines} # filter empty and those equal to toolchain defaults - diff = dict((k, v) for k, v in six.iteritems(diff) + diff = {k: v for k, v in diff.items() if v and (not hasattr(self.toolchain, k) or - getattr(self.toolchain, k) != v)) + getattr(self.toolchain, k) != v)} self.toolchain = self.toolchain.copy(**diff) self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy") self.source_suffix = source_suffix @@ -268,7 +265,7 @@ class CCompiler(object): debug_recompile=True): """Compile code, build and load shared library.""" logger.debug(code) - c_fname = self._tempname('code.' + self.source_suffix) + c_fname = self._tempname("code." 
+ self.source_suffix) # build object _, mod_name, ext_file, recompiled = \ @@ -277,9 +274,9 @@ debug_recompile, False) if recompiled: - logger.debug('Kernel {0} compiled from source'.format(name)) + logger.debug(f"Kernel {name} compiled from source") else: - logger.debug('Kernel {0} retrieved from cache'.format(name)) + logger.debug(f"Kernel {name} retrieved from cache") # and return compiled return ctypes.CDLL(ext_file) @@ -289,18 +286,18 @@ class CPlusPlusCompiler(CCompiler): """Subclass of CCompiler to invoke a C++ compiler.""" def __init__(self, toolchain=None, - cc='g++', cflags='-std=c++98 -O3 -fPIC'.split(), + cc="g++", cflags="-std=c++98 -O3 -fPIC".split(), ldflags=[], libraries=[], include_dirs=[], library_dirs=[], defines=[], - source_suffix='cpp'): + source_suffix="cpp"): - super(CPlusPlusCompiler, self).__init__( + super().__init__( toolchain=toolchain, cc=cc, cflags=cflags, ldflags=ldflags, libraries=libraries, include_dirs=include_dirs, library_dirs=library_dirs, defines=defines, source_suffix=source_suffix) -class IDIToCDLL(object): +class IDIToCDLL: """ A utility class that extracts argument and return type info from a :class:`ImplementedDataInfo` in order to create a :class:`ctypes.CDLL` @@ -323,14 +320,14 @@ def _dtype_to_ctype(self, dtype, pointer=False): """Map NumPy dtype to equivalent ctypes type.""" typename = self.registry.dtype_to_ctype(dtype) - typename = {'unsigned': 'uint'}.get(typename, typename) - basetype = getattr(ctypes, 'c_' + typename) + typename = {"unsigned": "uint"}.get(typename, typename) + basetype = getattr(ctypes, "c_" + typename) if pointer: return ctypes.POINTER(basetype) return basetype -class CompiledCKernel(object): +class CompiledCKernel: """ A CompiledCKernel wraps a loopy kernel, compiling it and loading the result as a shared library, and provides access to the kernel as a @@ -360,7 +357,7 @@ """Execute kernel with given args mapped to ctypes equivalents.""" args_ = [] for arg, arg_t in zip(args, self._fn.argtypes): - if hasattr(arg, 'ctypes'): + if hasattr(arg, "ctypes"): if arg.size == 0: # TODO eliminate unused arguments from kernel arg_ = arg_t(0.0) @@ -389,12 +386,15 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(program) + super().__init__(program) def get_invoker_uncached(self, kernel, codegen_result): generator = CExecutionWrapperGenerator() return generator(kernel, codegen_result) + def get_wrapper_generator(self): + return CExecutionWrapperGenerator() + @memoize_method def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): program = self.get_typed_and_scheduled_program(arg_to_dtype_set) @@ -404,7 +404,7 @@ class CKernelExecutor(KernelExecutorBase): dev_code = codegen_result.device_code() host_code = codegen_result.host_code() - all_code = '\n'.join([dev_code, '', host_code]) + all_code = "\n".join([dev_code, "", host_code]) if self.program.root_kernel.options.write_cl: output = all_code @@ -421,7 +421,7 @@ from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor - all_code = '\n'.join([dev_code, '', host_code]) + all_code = "\n".join([dev_code, "", host_code]) c_kernels = [] for dp in codegen_result.device_programs: diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 
b0bc187ebe71c2e9751ce95abe0050b5c06d6f26..046dfa455eb0b7bcf7015f17758650941f4e1ce6 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,8 +21,6 @@ THE SOFTWARE. """ -from six.moves import range - import numpy as np from pymbolic.mapper import RecursiveMapper, IdentityMapper @@ -44,11 +40,23 @@ from loopy.type_inference import TypeInferenceMapper from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType +from loopy.target.c import CExpression + + +__doc__ = """ +.. currentmodule:: loopy.target.c.codegen.expression + +.. autoclass:: ExpressionToCExpressionMapper +""" # {{{ Loopy expression to C expression mapper class ExpressionToCExpressionMapper(IdentityMapper): + """ + Mapper that converts a loopy-semantic expression to a C-semantic expression + with typecasts, appropriate arithmetic semantic mapping, etc. + """ def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None): self.kernel = codegen_state.kernel self.codegen_state = codegen_state @@ -113,7 +121,6 @@ class ExpressionToCExpressionMapper(IdentityMapper): prec = PREC_NONE assert prec == PREC_NONE - from loopy.target.c import CExpression return CExpression( self.codegen_state.ast_builder.get_c_expression_to_code_mapper(), self.rec(expr, type_context, needed_dtype)) @@ -127,7 +134,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): if expr.name in self.codegen_state.var_subst_map: if self.kernel.options.annotate_inames: return var( - "/* %s */ %s" % ( + "/* {} */ {}".format( expr.name, self.rec(self.codegen_state.var_subst_map[expr.name], type_context))) @@ -173,7 +180,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_subscript(self, expr, type_context): def base_impl(expr, type_context): - return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')] + return self.rec(expr.aggregate, type_context)[self.rec(expr.index, "i")] def make_var(name): from loopy import TaggedVariable @@ -221,7 +228,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): base_access = var("read_imagef")( var(ary.name), var("loopy_sampler"), - var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i'))) + var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, "i"))) if ary.dtype.numpy_dtype == np.float32: return base_access.attr("x") @@ -236,8 +243,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): elif isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)): if len(access_info.subscripts) == 0: if ( - (isinstance(ary, (ConstantArg, ArrayArg)) or - (isinstance(ary, TemporaryVariable) and ary.base_storage))): + isinstance(ary, (ConstantArg, ArrayArg)) or + (isinstance(ary, TemporaryVariable) and ary.base_storage)): # unsubscripted global args are pointers result = self.make_subscript( ary, @@ -255,7 +262,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): ary, make_var(access_info.array_name), simplify_using_aff( - self.kernel, self.rec(subscript, 'i'))) + self.kernel, self.rec(subscript, "i"))) if access_info.vector_index is not None: return self.codegen_state.ast_builder.add_vector_access( @@ -290,7 +297,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): return self.make_subscript( arg, var(expr.aggregate.name), - self.rec(offset + expr.index, 'i')) + self.rec(offset + expr.index, "i")) elif expr.aggregate.name in self.kernel.temporary_variables: raise 
RuntimeError("linear indexing is not supported on temporaries: %s" @@ -323,7 +330,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.codegen import SeenFunction self.codegen_state.seen_functions.add( SeenFunction( - name, "%s_%s" % (name, suffix), + name, f"{name}_{suffix}", (result_dtype, result_dtype))) if den_nonneg: @@ -333,14 +340,14 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.rec(expr.denominator, type_context)) else: seen_func("%s_pos_b" % base_func_name) - return var("%s_pos_b_%s" % (base_func_name, suffix))( - self.rec(expr.numerator, 'i'), - self.rec(expr.denominator, 'i')) + return var(f"{base_func_name}_pos_b_{suffix}")( + self.rec(expr.numerator, "i"), + self.rec(expr.denominator, "i")) else: seen_func(base_func_name) - return var("%s_%s" % (base_func_name, suffix))( - self.rec(expr.numerator, 'i'), - self.rec(expr.denominator, 'i')) + return var(f"{base_func_name}_{suffix}")( + self.rec(expr.numerator, "i"), + self.rec(expr.denominator, "i")) def map_floor_div(self, expr, type_context): import operator @@ -605,8 +612,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): if not self.allow_complex: return base_impl(expr, type_context) - n_complex = 'c' == n_dtype.kind - d_complex = 'c' == d_dtype.kind + n_complex = "c" == n_dtype.kind + d_complex = "c" == d_dtype.kind tgt_dtype = self.infer_type(expr) @@ -721,7 +728,7 @@ class CExpressionToCodeMapper(RecursiveMapper): func = self.rec(expr.function, PREC_CALL+1) return self.parenthesize_if_needed( - "%s(%s)" % ( + "{}({})".format( func, self.join_rec(", ", expr.parameters, PREC_NONE)), enclosing_prec, PREC_CALL) @@ -737,13 +744,13 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_lookup(self, expr, enclosing_prec): return self.parenthesize_if_needed( - "%s.%s" % ( + "{}.{}".format( self.rec(expr.aggregate, PREC_CALL), expr.name), enclosing_prec, PREC_CALL) def map_subscript(self, expr, enclosing_prec): return self.parenthesize_if_needed( - "%s[%s]" % ( + "{}[{}]".format( self.rec(expr.aggregate, PREC_CALL+1), self.rec(expr.index, PREC_NONE)), enclosing_prec, PREC_CALL) @@ -755,7 +762,7 @@ class CExpressionToCodeMapper(RecursiveMapper): result = self.rec(children.pop(), PREC_NONE) while children: - result = "%s(%s, %s)" % (what, + result = "{}({}, {})".format(what, self.rec(children.pop(), PREC_NONE), result) @@ -765,7 +772,7 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_if(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - return "(%s ? %s : %s)" % ( + return "({} ? {} : {})".format( self.rec(expr.condition, PREC_NONE), self.rec(expr.then, PREC_NONE), self.rec(expr.else_, PREC_NONE), @@ -775,7 +782,7 @@ class CExpressionToCodeMapper(RecursiveMapper): from pymbolic.mapper.stringifier import PREC_COMPARISON return self.parenthesize_if_needed( - "%s %s %s" % ( + "{} {} {}".format( self.rec(expr.left, PREC_COMPARISON), expr.operator, self.rec(expr.right, PREC_COMPARISON)), @@ -860,7 +867,7 @@ class CExpressionToCodeMapper(RecursiveMapper): force_parens_around=self.multiplicative_primitives) return self.parenthesize_if_needed( - "%s %s %s" % ( + "{} {} {}".format( # Space is necessary--otherwise '/*' # (i.e. divide-dererference) becomes # start-of-comment in C. 
@@ -879,7 +886,7 @@ class CExpressionToCodeMapper(RecursiveMapper): return self._map_division_operator("%", expr, enclosing_prec) def map_power(self, expr, enclosing_prec): - return "pow(%s, %s)" % ( + return "pow({}, {})".format( self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372..d1f993daecc03947d9e6e3e60d2a5145ecbf3786 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372 +Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index d713e06c08b6a16962043103e9bde440011e5359..83697e60161a12a38fbc240d086e6b6bce4876b1 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -1,6 +1,5 @@ """CUDA target independent of PyCUDA.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -59,18 +58,18 @@ def _create_vector_types(): vec.type_to_scalar_and_count = {} for base_name, base_type, counts in [ - ('char', np.int8, [1, 2, 3, 4]), - ('uchar', np.uint8, [1, 2, 3, 4]), - ('short', np.int16, [1, 2, 3, 4]), - ('ushort', np.uint16, [1, 2, 3, 4]), - ('int', np.int32, [1, 2, 3, 4]), - ('uint', np.uint32, [1, 2, 3, 4]), - ('long', long_dtype, [1, 2, 3, 4]), - ('ulong', ulong_dtype, [1, 2, 3, 4]), - ('longlong', np.int64, [1, 2]), - ('ulonglong', np.uint64, [1, 2]), - ('float', np.float32, [1, 2, 3, 4]), - ('double', np.float64, [1, 2]), + ("char", np.int8, [1, 2, 3, 4]), + ("uchar", np.uint8, [1, 2, 3, 4]), + ("short", np.int16, [1, 2, 3, 4]), + ("ushort", np.uint16, [1, 2, 3, 4]), + ("int", np.int32, [1, 2, 3, 4]), + ("uint", np.uint32, [1, 2, 3, 4]), + ("long", long_dtype, [1, 2, 3, 4]), + ("ulong", ulong_dtype, [1, 2, 3, 4]), + ("longlong", np.int64, [1, 2]), + ("ulonglong", np.uint64, [1, 2]), + ("float", np.float32, [1, 2, 3, 4]), + ("double", np.float64, [1, 2]), ]: for count in counts: name = "%s%d" % (base_name, count) @@ -171,8 +170,8 @@ class CudaCallable(ScalarCallable): raise LoopyError("%s does not support complex numbers" % name) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, - num_args)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1, + num_args)} return ( self.copy(name_in_target=name, @@ -185,7 +184,7 @@ class CudaCallable(ScalarCallable): def scope_cuda_functions(target, identifier): - if identifier in set(["dot"]) | set( + if identifier in {"dot"} | set( _CUDA_SPECIFIC_FUNCTIONS): return CudaCallable(name=identifier) @@ -209,12 +208,12 @@ class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper): raise LoopyError("unexpected index type") def map_group_hw_index(self, expr, type_context): - return var("((%s) blockIdx.%s)" % ( + return var("(({}) blockIdx.{})".format( self._get_index_ctype(self.kernel), self._GRID_AXES[expr.axis])) def map_local_hw_index(self, expr, type_context): - return var("((%s) threadIdx.%s)" % ( + return var("(({}) threadIdx.{})".format( self._get_index_ctype(self.kernel), self._GRID_AXES[expr.axis])) @@ -233,7 +232,7 @@ class CudaTarget(CFamilyTarget): """ self.extern_c = extern_c - super(CudaTarget, self).__init__() + super().__init__() def split_kernel_at_global_barriers(self): return True @@ -313,7 +312,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): def function_id_in_knl_callable_mapper(self): return [scope_cuda_functions] + ( - super(CUDACASTBuilder, 
self).function_id_in_knl_callable_mapper()) + super().function_id_in_knl_callable_mapper()) # }}} @@ -321,7 +320,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): def get_function_declaration(self, codegen_state, codegen_result, schedule_index): - fdecl = super(CUDACASTBuilder, self).get_function_declaration( + fdecl = super().get_function_declaration( codegen_state, codegen_result, schedule_index) from loopy.target.c import FunctionDeclarationWrapper @@ -356,7 +355,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): def preamble_generators(self): return ( - super(CUDACASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ cuda_preamble_generator]) # }}} @@ -456,7 +455,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): lhs_expr_code = ecm(lhs_expr) rhs_expr_code = ecm(new_rhs_expr) - return Statement("atomicAdd(&{0}, {1})".format( + return Statement("atomicAdd(&{}, {})".format( lhs_expr_code, rhs_expr_code)) else: from cgen import Block, DoWhile, Assign diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 96f6e065c9cabccbe48071d6d4be10a059813cf3..1a98ffdc90eec2db9e83f014c07b3da1b0c8108e 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement, absolute_import - __copyright__ = "Copyright (C) 2012-17 Andreas Kloeckner, Nick Curtis" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six import numpy as np from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError @@ -51,7 +48,7 @@ class _PackingInfo(ImmutableRecord): """ -class SeparateArrayPackingController(object): +class SeparateArrayPackingController: """For argument arrays with axes tagged to be implemented as separate arrays, this class provides preprocessing of the incoming arguments so that all sub-arrays may be passed in one object array (under the original, @@ -91,7 +88,7 @@ class SeparateArrayPackingController(object): kernel_kwargs = kernel_kwargs.copy() - for packing_info in six.itervalues(self.packing_info): + for packing_info in self.packing_info.values(): arg_name = packing_info.name if packing_info.name in kernel_kwargs: arg = kernel_kwargs[arg_name] @@ -106,7 +103,7 @@ class SeparateArrayPackingController(object): if not self.packing_info: return outputs - for packing_info in six.itervalues(self.packing_info): + for packing_info in self.packing_info.values(): if not packing_info.is_written: continue @@ -123,7 +120,7 @@ class SeparateArrayPackingController(object): # {{{ ExecutionWrapperGeneratorBase -class ExecutionWrapperGeneratorBase(object): +class ExecutionWrapperGeneratorBase: """ A set of common methods for generating a wrapper for execution @@ -195,12 +192,12 @@ class ExecutionWrapperGeneratorBase(object): gen("# {{{ find integer arguments from shapes") gen("") - for iarg_name, sources in six.iteritems(iarg_to_sources): + for iarg_name, sources in iarg_to_sources.items(): gen("if %s is None:" % iarg_name) with Indentation(gen): if_stmt = "if" for arg_name, value_expr in sources: - gen("%s %s is not None:" % (if_stmt, arg_name)) + gen(f"{if_stmt} {arg_name} is not None:") with Indentation(gen): gen("%s = %s" % (iarg_name, StringifyMapper()(value_expr))) @@ -236,7 +233,7 @@ class ExecutionWrapperGeneratorBase(object): gen("else:") with Indentation(gen): if not options.no_numpy: - gen("_lpy_offset = getattr(%s, \"offset\", 0)" + gen('_lpy_offset = getattr(%s, "offset", 0)' % impl_array_name) else: gen("_lpy_offset = %s.offset" % impl_array_name) @@ -248,7 +245,7 @@ 
class ExecutionWrapperGeneratorBase(object): % (arg.name, base_arg.dtype.itemsize)) gen("assert _lpy_remdr == 0, \"Offset of array '%s' is " - "not divisible by its dtype itemsize\"" + 'not divisible by its dtype itemsize"' % impl_array_name) gen("del _lpy_remdr") else: @@ -283,7 +280,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise RuntimeError(\"required stride '%s' for " "argument '%s' not given or deducible from " - "passed array\")" + 'passed array")' % (arg.name, impl_array_name)) base_arg = program.impl_arg_to_arg[impl_array_name] @@ -294,7 +291,7 @@ class ExecutionWrapperGeneratorBase(object): base_arg.dtype.dtype.itemsize)) gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' " - " is not divisible by its dtype itemsize\"" + ' is not divisible by its dtype itemsize"' % (stride_impl_axis, impl_array_name)) gen("del _lpy_remdr") else: @@ -326,7 +323,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise TypeError(\"value argument '%s' " "was not given and could not be automatically " - "determined\")" % arg.name) + 'determined")' % arg.name) gen("# }}}") gen("") @@ -411,7 +408,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"input argument '%s' must " - "be supplied\")" % arg.name) + 'be supplied")' % arg.name) gen("") if (is_written @@ -420,14 +417,14 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"written image '%s' must " - "be supplied\")" % arg.name) + 'be supplied")' % arg.name) gen("") if is_written and arg.shape is None and not options.skip_arg_checks: gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"written argument '%s' has " - "unknown shape and must be supplied\")" % arg.name) + 'unknown shape and must be supplied")' % arg.name) gen("") possibly_made_by_loopy = False @@ -470,7 +467,7 @@ class ExecutionWrapperGeneratorBase(object): program_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " - "(got: %%s, expected: %s)\" %% %s.dtype)" + '(got: %%s, expected: %s)" %% %s.dtype)' % (arg.name, arg.dtype, arg.name)) # {{{ generate shape checking code @@ -491,7 +488,7 @@ class ExecutionWrapperGeneratorBase(object): shape_mismatch_msg = ( "raise TypeError(\"shape mismatch on argument '%s' " - "(got: %%s, expected: %%s)\" " + '(got: %%s, expected: %%s)" ' "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) @@ -530,8 +527,9 @@ class ExecutionWrapperGeneratorBase(object): shape = ["_lpy_shape_%d" % i for i in range(ndim)] strides = ["_lpy_stride_%d" % i for i in range(ndim)] - gen("(%s,) = %s.shape" % (", ".join(shape), arg.name)) - gen("(%s,) = %s.strides" % (", ".join(strides), arg.name)) + gen("({},) = {}.shape".format(", ".join(shape), arg.name)) + gen("({},) = {}.strides".format( + ", ".join(strides), arg.name)) gen("if not (%s):" % self.get_strides_check_expr( @@ -547,21 +545,21 @@ class ExecutionWrapperGeneratorBase(object): "if dim > 1)" % (arg.name, strify_tuple(sym_strides))) - gen("raise TypeError(\"strides mismatch on " + gen('raise TypeError("strides mismatch on ' "argument '%s' " "(after removing unit length dims, " - "got: %%s, expected: %%s)\" " + 'got: %%s, expected: %%s)" ' "%% (_lpy_got, _lpy_expected))" % arg.name) if not arg.allows_offset: - gen("if hasattr(%s, 'offset') and %s.offset:" % ( + gen("if hasattr({}, 'offset') and 
{}.offset:".format( arg.name, arg.name)) with Indentation(gen): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_program()." - "\")" % arg.name) + "default_offset=loopy.auto to make_kernel()." + '")' % arg.name) gen("") # }}} @@ -691,7 +689,7 @@ class _KernelInfo(ImmutableRecord): pass -class _Kernels(object): +class _Kernels: pass @@ -707,7 +705,7 @@ invoker_cache = WriteOncePersistentDict( # {{{ kernel executor -class KernelExecutorBase(object): +class KernelExecutorBase: """An object connecting a kernel to a :class:`pyopencl.Context` for execution. @@ -797,7 +795,7 @@ class KernelExecutorBase(object): impl_arg_to_arg = self.program.impl_arg_to_arg arg_to_dtype = {} - for arg_name, val in six.iteritems(kwargs): + for arg_name, val in kwargs.items(): arg = impl_arg_to_arg.get(arg_name, None) if arg is None: @@ -812,7 +810,7 @@ class KernelExecutorBase(object): else: arg_to_dtype[arg_name] = dtype - return frozenset(six.iteritems(arg_to_dtype)) + return frozenset(arg_to_dtype.items()) # {{{ debugging aids @@ -833,7 +831,7 @@ class KernelExecutorBase(object): if arg_to_dtype is not None: arg_to_dtype = frozenset( - (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) + (k, process_dtype(v)) for k, v in arg_to_dtype.items()) kernel = self.get_typed_and_scheduled_program(arg_to_dtype) @@ -844,6 +842,9 @@ class KernelExecutorBase(object): def get_invoker_uncached(self, kernel, *args): raise NotImplementedError() + def get_wrapper_generator(self): + raise NotImplementedError() + def get_invoker(self, kernel, *args): from loopy import CACHING_ENABLED diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 812bf3a560b191bd61f5b86cb401983ad97467a2..6558ac0ec40ca39d7ae429edb5e401ade7d16958 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -1,6 +1,5 @@ """Target for Intel ISPC.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -93,7 +92,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): return expr else: - return super(ExprToISPCExprMapper, self).map_variable( + return super().map_variable( expr, type_context) def map_subscript(self, expr, type_context): @@ -117,7 +116,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): subscript, = access_info.subscripts result = var(access_info.array_name)[ - var("programIndex") + self.rec(lsize*subscript, 'i')] + var("programIndex") + self.rec(lsize*subscript, "i")] if access_info.vector_index is not None: return self.kernel.target.add_vector_access( @@ -125,7 +124,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): else: return result - return super(ExprToISPCExprMapper, self).map_subscript( + return super().map_subscript( expr, type_context) # }}} @@ -167,7 +166,7 @@ class ISPCTarget(CFamilyTarget): """ self.occa_mode = occa_mode - super(ISPCTarget, self).__init__() + super().__init__() host_program_name_suffix = "" device_program_name_suffix = "_inner" @@ -274,7 +273,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): result.append( ISPCLaunch( tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize), - "%s(%s)" % ( + "{}({})".format( name, ", ".join(arg_names) ))) @@ -352,7 +351,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): dtype, is_written) def get_value_arg_decl(self, name, shape, dtype, is_written): - result = super(ISPCASTBuilder, self).get_value_arg_decl( + result = super().get_value_arg_decl( name, shape, dtype, is_written) from cgen import Reference, Const @@ 
-476,7 +475,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): "streaming_store(%s + %s, %s)" % ( access_info.array_name, - ecm(flattened_sum(new_terms), PREC_NONE, 'i'), + ecm(flattened_sum(new_terms), PREC_NONE, "i"), rhs_code)) # }}} diff --git a/loopy/target/numba.py b/loopy/target/numba.py index 6946063ee04f52a4890344b4cbff9446bacb6923..2df81ec1f332be87d8ca361480a37b68b369b56f 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -1,6 +1,5 @@ """Python host AST builder for integration with PyOpenCL.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" @@ -44,7 +43,7 @@ def _base_numba_preamble_generator(preamble_info): class NumbaBaseASTBuilder(PythonASTBuilderBase): def preamble_generators(self): return ( - super(NumbaBaseASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ _base_numba_preamble_generator ]) @@ -72,7 +71,7 @@ class NumbaBaseASTBuilder(PythonASTBuilderBase): implemented_data_info = codegen_state.implemented_data_info return Statement( - "%s[%s, %s](%s)" % ( + "{}[{}, {}]({})".format( name, ecm(gsize, PREC_NONE), ecm(lsize, PREC_NONE), @@ -155,7 +154,7 @@ def _cuda_numba_preamble_generator(preamble_info): class NumbaCudaASTBuilder(NumbaBaseASTBuilder): def preamble_generators(self): return ( - super(NumbaCudaASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ _cuda_numba_preamble_generator ]) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 6b7ef1b886d0620ef4fdbb9ccb1d208bba43f14f..0cc93ca289d641fda488e08115df3834371aacb8 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -1,6 +1,5 @@ """OpenCL target independent of PyOpenCL.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -48,7 +47,7 @@ class DTypeRegistryWrapperWithAtomics(DTypeRegistryWrapper): return super(self.wrapped_registry.get_or_register_dtype( names, NumpyType(dtype.dtype))) - return super(DTypeRegistryWrapperWithAtomics, self).get_or_register_dtype( + return super().get_or_register_dtype( names, dtype) @@ -59,7 +58,7 @@ class DTypeRegistryWrapperWithCL1Atomics(DTypeRegistryWrapperWithAtomics): if isinstance(dtype, AtomicNumpyType): return "volatile " + self.wrapped_registry.dtype_to_ctype(dtype) else: - return super(DTypeRegistryWrapperWithCL1Atomics, self).dtype_to_ctype( + return super().dtype_to_ctype( dtype) # }}} @@ -81,16 +80,16 @@ def _create_vector_types(): counts = [2, 3, 4, 8, 16] for base_name, base_type in [ - ('char', np.int8), - ('uchar', np.uint8), - ('short', np.int16), - ('ushort', np.uint16), - ('int', np.int32), - ('uint', np.uint32), - ('long', np.int64), - ('ulong', np.uint64), - ('float', np.float32), - ('double', np.float64), + ("char", np.int8), + ("uchar", np.uint8), + ("short", np.int16), + ("ushort", np.uint16), + ("int", np.int32), + ("uint", np.uint32), + ("long", np.int64), + ("ulong", np.uint64), + ("float", np.float32), + ("double", np.float64), ]: for count in counts: name = "%s%d" % (base_name, count) @@ -148,22 +147,22 @@ _CL_SIMPLE_MULTI_ARG_FUNCTIONS = { } -VECTOR_LITERAL_FUNCS = dict( - ("make_%s%d" % (name, count), (name, dtype, count)) +VECTOR_LITERAL_FUNCS = { + "make_%s%d" % (name, count): (name, dtype, count) for name, dtype in [ - ('char', np.int8), - ('uchar', np.uint8), - ('short', np.int16), - ('ushort', np.uint16), - ('int', np.int32), - ('uint', np.uint32), - ('long', np.int64), - ('ulong', np.uint64), - ('float', np.float32), - ('double', 
np.float64), + ("char", np.int8), + ("uchar", np.uint8), + ("short", np.int16), + ("ushort", np.uint16), + ("int", np.int32), + ("uint", np.uint32), + ("long", np.int64), + ("ulong", np.uint64), + ("float", np.float32), + ("double", np.float64), ] for count in [2, 3, 4, 8, 16] - ) + } class OpenCLCallable(ScalarCallable): @@ -187,9 +186,9 @@ class OpenCLCallable(ScalarCallable): [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() if (id >= 0 and dtype is not None)]) - if dtype.kind in ['u', 'i', 'f']: - if dtype.kind == 'f': - name = 'f'+name + if dtype.kind in ["u", "i", "f"]: + if dtype.kind == "f": + name = "f"+name dtype = NumpyType(dtype) return ( self.copy(name_in_target=name, @@ -243,8 +242,8 @@ class OpenCLCallable(ScalarCallable): raise LoopyError("%s does not support complex numbers" % name) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, - num_args)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1, + num_args)} return ( self.copy(name_in_target=name, @@ -267,8 +266,8 @@ class OpenCLCallable(ScalarCallable): self.copy(arg_id_to_dtype=arg_id_to_dtype), callables_table) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in - range(count)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in + range(count)} updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( NumpyType(dtype), count) @@ -289,7 +288,7 @@ def scope_opencl_functions(target, identifier): Returns an instance of :class:`InKernelCallable` if the function defined by *identifier* is known in OpenCL. """ - opencl_function_ids = set(["max", "min", "dot"]) | set( + opencl_function_ids = {"max", "min", "dot"} | set( _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) if identifier in opencl_function_ids: @@ -391,7 +390,7 @@ class OpenCLTarget(CFamilyTarget): for floating point), ``"cl1-exch"`` (OpenCL 1.1 atomics, using double-exchange for floating point--not yet supported). 
""" - super(OpenCLTarget, self).__init__() + super().__init__() if atomics_flavor is None: atomics_flavor = "cl1" @@ -443,19 +442,19 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): def function_id_in_knl_callable_mapper(self): return ( - [scope_opencl_functions] + super( - OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) + [scope_opencl_functions] + + super().function_id_in_knl_callable_mapper()) def symbol_manglers(self): return ( - super(OpenCLCASTBuilder, self).symbol_manglers() + [ + super().symbol_manglers() + [ opencl_symbol_mangler ]) def preamble_generators(self): return ( - super(OpenCLCASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ opencl_preamble_generator]) # }}} @@ -464,7 +463,7 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): def get_function_declaration(self, codegen_state, codegen_result, schedule_index): - fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( + fdecl = super().get_function_declaration( codegen_state, codegen_result, schedule_index) from loopy.target.c import FunctionDeclarationWrapper @@ -529,7 +528,7 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): mem_kind = mem_kind.upper() from cgen import Statement - return Statement("barrier(CLK_%s_MEM_FENCE)%s" % (mem_kind, comment)) + return Statement(f"barrier(CLK_{mem_kind}_MEM_FENCE){comment}") elif synchronization_kind == "global": raise LoopyError("OpenCL does not have global barriers") else: @@ -554,13 +553,13 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.kernel.data import AddressSpace if mem_address_space == AddressSpace.LOCAL: - return CLLocal(super(OpenCLCASTBuilder, self).get_array_arg_decl( + return CLLocal(super().get_array_arg_decl( name, mem_address_space, shape, dtype, is_written)) elif mem_address_space == AddressSpace.PRIVATE: - return super(OpenCLCASTBuilder, self).get_array_arg_decl( + return super().get_array_arg_decl( name, mem_address_space, shape, dtype, is_written) elif mem_address_space == AddressSpace.GLOBAL: - return CLGlobal(super(OpenCLCASTBuilder, self).get_array_arg_decl( + return CLGlobal(super().get_array_arg_decl( name, mem_address_space, shape, dtype, is_written)) else: raise ValueError("unexpected array argument scope: %s" @@ -627,8 +626,10 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.kernel.data import TemporaryVariable, AddressSpace ecm = codegen_state.expression_to_code_mapper.with_assignments( { - old_val_var: TemporaryVariable(old_val_var, lhs_dtype), - new_val_var: TemporaryVariable(new_val_var, lhs_dtype), + old_val_var: TemporaryVariable(old_val_var, lhs_dtype, + shape=()), + new_val_var: TemporaryVariable(new_val_var, lhs_dtype, + shape=()), }) lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None) @@ -688,7 +689,7 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): old_val = "*(%s *) &" % ctype + old_val new_val = "*(%s *) &" % ctype + new_val - cast_str = "(%s %s *) " % (var_kind, ctype) + cast_str = f"({var_kind} {ctype} *) " return Block([ POD(self, NumpyType(lhs_dtype.dtype, target=self.target), diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c11c309614a78486635e330ed00aac46abc123fb..2008c92246daed4ae853177d351f43103cb73db3 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -1,9 +1,5 @@ """OpenCL target integrated with PyOpenCL.""" -from __future__ import division, absolute_import - -import sys - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -26,9 +22,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
OTHER DEALINGS IN THE SOFTWARE. """ -import six -from six.moves import range - import numpy as np from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder @@ -55,7 +48,7 @@ def adjust_local_temp_var_storage(kernel, device): from loopy.kernel.data import AddressSpace lmem_size = cl_char.usable_local_mem_size(device) - for temp_var in six.itervalues(kernel.temporary_variables): + for temp_var in kernel.temporary_variables.values(): if temp_var.address_space != AddressSpace.LOCAL: new_temp_vars[temp_var.name] = \ temp_var.copy(storage_shape=temp_var.shape) @@ -68,7 +61,7 @@ def adjust_local_temp_var_storage(kernel, device): other_loctemp_nbytes = [ tv.nbytes - for tv in six.itervalues(kernel.temporary_variables) + for tv in kernel.temporary_variables.values() if tv.address_space == AddressSpace.LOCAL and tv.name != temp_var.name] @@ -236,7 +229,7 @@ class PyOpenCLCallable(ScalarCallable): raise LoopyTypeError("unexpected complex type '%s'" % dtype) return ( - self.copy(name_in_target="%s_%s" % (tpname, name), + self.copy(name_in_target=f"{tpname}_{name}", arg_id_to_dtype={0: dtype, -1: NumpyType( np.dtype(dtype.numpy_dtype.type(0).real))}), callables_table) @@ -255,16 +248,16 @@ class PyOpenCLCallable(ScalarCallable): raise LoopyTypeError("unexpected complex type '%s'" % dtype) return ( - self.copy(name_in_target="%s_%s" % (tpname, name), + self.copy(name_in_target=f"{tpname}_{name}", arg_id_to_dtype={0: dtype, -1: dtype}), callables_table) else: # function calls for floating parameters. numpy_dtype = dtype.numpy_dtype - if numpy_dtype.kind in ('u', 'i'): + if numpy_dtype.kind in ("u", "i"): dtype = dtype.copy(numpy_dtype=np.float32) - if name == 'abs': - name = 'fabs' + if name == "abs": + name = "fabs" return ( self.copy(name_in_target=name, arg_id_to_dtype={0: dtype, -1: dtype}), @@ -316,7 +309,7 @@ def pyopencl_preamble_generator(preamble_info): # {{{ pyopencl tools -class _LegacyTypeRegistryStub(object): +class _LegacyTypeRegistryStub: """Adapts legacy PyOpenCL type registry to be usable with PyOpenCLTarget.""" def get_or_register_dtype(self, names, dtype=None): @@ -338,6 +331,9 @@ class PyOpenCLTarget(OpenCLTarget): warnings) and support for complex numbers. """ + # FIXME make prefixes conform to naming rules + # (see Reference: Loopy’s Model of a Kernel) + host_program_name_prefix = "_lpy_host_" host_program_name_suffix = "" @@ -346,7 +342,7 @@ class PyOpenCLTarget(OpenCLTarget): # This ensures the dtype registry is populated. 
import pyopencl.tools # noqa - super(PyOpenCLTarget, self).__init__( + super().__init__( atomics_flavor=atomics_flavor) self.device = device @@ -359,7 +355,7 @@ class PyOpenCLTarget(OpenCLTarget): "pyopencl_module_name",) def __eq__(self, other): - if not super(PyOpenCLTarget, self).__eq__(other): + if not super().__eq__(other): return False if (self.device is None) != (other.device is None): @@ -367,20 +363,21 @@ class PyOpenCLTarget(OpenCLTarget): if self.device is not None: assert other.device is not None - return (self.device.persistent_unique_id - == other.device.persistent_unique_id) + return (self.device.hashable_model_and_version_identifier + == other.device.hashable_model_and_version_identifier) else: assert other.device is None return True def update_persistent_hash(self, key_hash, key_builder): - super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder) - key_builder.rec(key_hash, getattr(self.device, "persistent_unique_id", None)) + super().update_persistent_hash(key_hash, key_builder) + key_builder.rec(key_hash, getattr( + self.device, "hashable_model_and_version_identifier", None)) def __getstate__(self): dev_id = None if self.device is not None: - dev_id = self.device.persistent_unique_id + dev_id = self.device.hashable_model_and_version_identifier return { "device_id": dev_id, @@ -403,7 +400,7 @@ class PyOpenCLTarget(OpenCLTarget): dev for plat in cl.get_platforms() for dev in plat.get_devices() - if dev.persistent_unique_id == dev_id] + if dev.hashable_model_and_version_identifier == dev_id] if matches: self.device = matches[0] @@ -568,12 +565,9 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info): if idi.dtype.is_integral(): gen(Comment("cast to Python int to avoid trouble " "with struct packing or Boost.Python")) - if sys.version_info < (3,): - py_type = "long" - else: - py_type = "int" + py_type = "int" - gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name))) + gen(Assign(idi.name, f"{py_type}({idi.name})")) gen(Line()) if idi.dtype.is_composite(): @@ -692,7 +686,7 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): + ["wait_for=None", "allocator=None"]) from genpy import (For, Function, Suite, Import, ImportAs, Return, - FromImport, If, Assign, Line, Statement as S) + FromImport, Line, Statement as S) return Function( codegen_result.current_program(codegen_state).name, args, @@ -701,11 +695,6 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): ImportAs("pyopencl", "_lpy_cl"), Import("pyopencl.tools"), Line(), - If("allocator is None", - Assign( - "allocator", - "_lpy_cl_tools.DeferredAllocator(queue.context)")), - Line(), ] + [ Line(), function_body, @@ -728,14 +717,14 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from genpy import Assign, Comment, Line def alloc_nbytes(tv): - from six.moves import reduce + from functools import reduce from operator import mul return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from loopy.kernel.data import AddressSpace global_temporaries = sorted( - (tv for tv in six.itervalues(codegen_state.kernel.temporary_variables) + (tv for tv in codegen_state.kernel.temporary_variables.values() if tv.address_space == AddressSpace.GLOBAL), key=lambda tv: tv.name) @@ -780,6 +769,13 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from genpy import Suite, Assign, Assert, Line, Comment from pymbolic.mapper.stringifier import PREC_NONE + import pyopencl.version as cl_ver + if cl_ver.VERSION < (2020, 2): + from warnings import warn + warn("Your kernel invocation will likely 
fail because your " + "version of PyOpenCL does not support allow_empty_ndrange. " + "Please upgrade to version 2020.2 or newer.") + # TODO: Generate finer-grained dependency structure return Suite([ Comment("{{{ enqueue %s" % name), @@ -791,7 +787,8 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): arry_arg_code, Assign("_lpy_evt", "%(pyopencl_module_name)s.enqueue_nd_range_kernel(" "queue, _lpy_knl, " - "%(gsize)s, %(lsize)s, wait_for=wait_for, g_times_l=True)" + "%(gsize)s, %(lsize)s, wait_for=wait_for, " + "g_times_l=True, allow_empty_ndrange=True)" % dict( pyopencl_module_name=self.target.pyopencl_module_name, gsize=ecm(gsize, prec=PREC_NONE, type_context="i"), @@ -820,83 +817,19 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): random123_function_id_to_in_knl_callable_mapper) return ( [pyopencl_function_id_to_in_knl_callable_mapper, - random123_function_id_to_in_knl_callable_mapper] + super( - PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) + random123_function_id_to_in_knl_callable_mapper] + + super().function_id_in_knl_callable_mapper()) def preamble_generators(self): return ([ pyopencl_preamble_generator, - ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) + ] + super().preamble_generators()) # }}} # }}} -class NvidiaPyOpenCLTarget(PyOpenCLTarget): - def __init__(self, device, pyopencl_module_name="_lpy_cl", - atomics_flavor=None): - import pyopencl as cl - assert isinstance(device, cl.Device) - assert device.vendor == 'NVIDIA Corporation' - - super(NvidiaPyOpenCLTarget, self).__init__(device, - pyopencl_module_name, atomics_flavor) - - def preprocess(self, kernel): - from loopy import set_options - if self.device.compute_capability_major_nv >= 6: - build_options = ['-cl-nv-arch', 'sm_60'] + ( - kernel.options.cl_build_options) - kernel = set_options(kernel, cl_build_options=build_options) - return super(NvidiaPyOpenCLTarget, self).preprocess(kernel) - - def get_device_ast_builder(self): - # here we should have an if else condition - if self.device.compute_capability_major_nv >= 6: - return NvidiaPyOpenCLCASTBuilder(self) - else: - return super(NvidiaPyOpenCLTarget, self).get_device_ast_builder() - - -class NvidiaPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): - def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): - - from pymbolic.primitives import Sum - from cgen import Statement, Block, Assign - from loopy.target.c import POD - - if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype == np.float64: - # atomicAdd - if isinstance(rhs_expr, Sum): - - old_val_var = codegen_state.var_name_generator("loopy_old_val") - - from loopy.kernel.data import TemporaryVariable - ecm = codegen_state.expression_to_code_mapper.with_assignments( - { - old_val_var: TemporaryVariable(old_val_var, lhs_dtype), - }) - - new_rhs_expr = Sum(tuple(c for c in rhs_expr.children - if c != lhs_expr)) - lhs_expr_code = ecm(lhs_expr) - rhs_expr_code = ecm(new_rhs_expr) - - return Block([ - POD(self, NumpyType(lhs_dtype.dtype, target=self.target), - old_val_var), - Assign(old_val_var, lhs_expr_code), - Statement('asm volatile("atom.global.add.f64 %0, [%1], %2;" :' - '"=d"({0}) : "l"(&{1}) , "d"({2}))'.format( - old_val_var, lhs_expr_code, rhs_expr_code))]) - - return super(NvidiaPyOpenCLCASTBuilder, - self).emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) - - # {{{ volatile mem acccess target class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): 
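A note on the loopy/target/pyopencl.py hunks above: the generated invoker now
passes allow_empty_ndrange=True to pyopencl.enqueue_nd_range_kernel and warns
when PyOpenCL is older than 2020.2, the first release that accepts the flag.
The following minimal sketch, separate from this patch, shows the PyOpenCL
behavior being relied on; the kernel, buffer, and sizes are illustrative only.

import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

prg = cl.Program(ctx, """
    __kernel void fill(__global float *a)
    { a[get_global_id(0)] = 1.0f; }
    """).build()

buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=4 * 16)
knl = prg.fill
knl.set_args(buf)

# A zero-size grid can arise when a parametric loop bound is 0 at run time.
# With allow_empty_ndrange (PyOpenCL >= 2020.2) the enqueue becomes a no-op
# instead of raising.
evt = cl.enqueue_nd_range_kernel(
        queue, knl, (0,), (1,), allow_empty_ndrange=True)
evt.wait()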
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index b7006575bb05561e29320f092935e7bb5dcab006..269a0ef0956fd106e284d3b1c7ae513e83a71234 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -1,5 +1,3 @@ -from __future__ import division, with_statement, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from six.moves import range, zip from pytools import memoize_method from pytools.py_codegen import Indentation @@ -50,14 +47,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # ignored if options.no_numpy "out_host=None" ] - super(PyOpenCLExecutionWrapperGenerator, self).__init__(system_args) + super().__init__(system_args) def python_dtype_str(self, dtype): import pyopencl.tools as cl_tools if dtype.isbuiltin: return "_lpy_np."+dtype.name else: - return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")" + return ('_lpy_cl_tools.get_or_register_dtype("%s")' % cl_tools.dtype_to_ctype(dtype)) # {{{ handle non-numpy args @@ -65,6 +62,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def handle_non_numpy_arg(self, gen, arg): gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): + gen("# retain originally passed array") + gen(f"_lpy_{arg.name}_np_input = {arg.name}") gen("# synchronous, nothing to worry about") gen("%s = _lpy_cl_array.to_device(" "queue, %s, allocator=allocator)" @@ -73,16 +72,20 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("elif %s is not None:" % arg.name) with Indentation(gen): gen("_lpy_encountered_dev = True") + gen("_lpy_%s_np_input = None" % arg.name) + gen("else:") + with Indentation(gen): + gen("_lpy_%s_np_input = None" % arg.name) gen("") # }}} - # {{{ handle allocation of unspecified arguements + # {{{ handle allocation of unspecified arguments def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks): """ - Handle allocation of non-specified arguements for pyopencl execution + Handle allocation of non-specified arguments for pyopencl execution """ from pymbolic import var @@ -142,7 +145,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def initialize_system_args(self, gen): """ - Initializes possibly empty system arguements + Initializes possibly empty system arguments """ gen("if allocator is None:") with Indentation(gen): @@ -184,7 +187,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if (issubclass(arg.arg_class, ArrayArg) and arg.base_name in ( program.root_kernel.get_written_variables())): - gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) + gen(f"{arg.name}.add_event(_lpy_evt)") # }}} @@ -201,23 +204,24 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): with Indentation(gen): gen("out_host = True") - gen("if out_host:") - with Indentation(gen): - gen("pass") # if no outputs (?!) 
- for arg in implemented_data_info: - if not issubclass(arg.arg_class, KernelArgument): - continue - - is_written = arg.base_name in ( - program.root_kernel.get_written_variables()) - if is_written: - gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) + for arg in implemented_data_info: + if not issubclass(arg.arg_class, KernelArgument): + continue + + is_written = (arg.base_name in + program.root_kernel.get_written_variables()) + if is_written: + np_name = "_lpy_%s_np_input" % arg.name + gen("if out_host or %s is not None:" % np_name) + with Indentation(gen): + gen("%s = %s.get(queue=queue, ary=%s)" + % (arg.name, arg.name, np_name)) gen("") if options.return_dict: gen("return _lpy_evt, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) + % ", ".join(f'"{arg.name}": {arg.name}' for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) if arg.base_name in @@ -264,7 +268,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. """ - super(PyOpenCLKernelExecutor, self).__init__(program) + super().__init__(program) self.context = context @@ -277,6 +281,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): generator = PyOpenCLExecutionWrapperGenerator() return generator(kernel, codegen_result) + def get_wrapper_generator(self): + return PyOpenCLExecutionWrapperGenerator() + @memoize_method def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): program = self.get_typed_and_scheduled_program(arg_to_dtype_set) @@ -321,7 +328,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): def __call__(self, queue, **kwargs): """ :arg allocator: a callable passed a byte count and returning - a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator + a :class:`pyopencl.Buffer`. A :mod:`pyopencl` allocator maybe. :arg wait_for: A list of :class:`pyopencl.Event` instances for which to wait. diff --git a/loopy/target/python.py b/loopy/target/python.py index 1f83112ff8fd9f32f2e48f3c76a3de0abaad92fd..c27b4484d29b8dae7ddc83c4ae80221c9afb8e29 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -1,6 +1,5 @@ """Python host AST builder for integration with PyOpenCL.""" -from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2016 Andreas Kloeckner" @@ -24,7 +23,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six import numpy as np from pymbolic.mapper import Mapper @@ -52,7 +50,7 @@ class ExpressionToPythonMapper(StringifyMapper): return Mapper.handle_unsupported_expression(self, victim, enclosing_prec) def rec(self, expr, prec, type_context=None, needed_dtype=None): - return super(ExpressionToPythonMapper, self).rec(expr, prec) + return super().rec(expr, prec) __call__ = rec @@ -67,19 +65,19 @@ class ExpressionToPythonMapper(StringifyMapper): enclosing_prec)) if expr.name in self.kernel.all_inames(): - return super(ExpressionToPythonMapper, self).map_variable( + return super().map_variable( expr, enclosing_prec) var_descr = self.kernel.get_var_descriptor(expr.name) if isinstance(var_descr, ValueArg): - return super(ExpressionToPythonMapper, self).map_variable( + return super().map_variable( expr, enclosing_prec) - return super(ExpressionToPythonMapper, self).map_variable( + return super().map_variable( expr, enclosing_prec) def map_subscript(self, expr, enclosing_prec): - return super(ExpressionToPythonMapper, self).map_subscript( + return super().map_subscript( expr, enclosing_prec) def map_call(self, expr, enclosing_prec): @@ -113,7 +111,8 @@ class ExpressionToPythonMapper(StringifyMapper): str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) + return "{}({})".format(in_knl_callable.name_in_target, + ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -144,8 +143,7 @@ class ExpressionToPythonMapper(StringifyMapper): class Collection(Suite): def generate(self): for item in self.contents: - for item_line in item.generate(): - yield item_line + yield from item.generate() # }}} @@ -183,13 +181,12 @@ class PythonASTBuilderBase(ASTBuilderBase): def function_id_in_knl_callable_mapper(self): from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, - self).function_id_in_knl_callable_mapper() + + super().function_id_in_knl_callable_mapper() + [scope_c_math_functions]) def preamble_generators(self): return ( - super(PythonASTBuilderBase, self).preamble_generators() + [ + super().preamble_generators() + [ _base_python_preamble_generator ]) @@ -219,7 +216,7 @@ class PythonASTBuilderBase(ASTBuilderBase): from genpy import Assign for tv in sorted( - six.itervalues(kernel.temporary_variables), + kernel.temporary_variables.values(), key=lambda tv: tv.name): if tv.shape: result.append( diff --git a/loopy/tools.py b/loopy/tools.py index 524638e4a9fb2c624c49a3da53fd3cbdeca907c8..e8d529d2da3ad87c649371bcee7bdcc22ad407fb 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,13 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six - -try: - import collections.abc as abc -except ImportError: - # Python 2 - import collections as abc +import collections.abc as abc import numpy as np from pytools import memoize_method @@ -36,18 +28,11 @@ from pytools.persistent_dict import KeyBuilder as KeyBuilderBase from loopy.symbolic import WalkMapper as LoopyWalkMapper from pymbolic.mapper.persistent_hash import ( PersistentHashWalkMapper as PersistentHashWalkMapperBase) -import six # noqa -from six.moves import intern -import re -from mako.template import Template -import loopy as lp +from sys import intern + -if six.PY2: - def is_integer(obj): - return isinstance(obj, (int, long, np.integer)) # noqa pylint:disable=undefined-variable -else: - def is_integer(obj): - return isinstance(obj, (int, np.integer)) +def is_integer(obj): + return isinstance(obj, (int, np.integer)) def update_persistent_hash(obj, key_hash, key_builder): @@ -91,7 +76,7 @@ class LoopyKeyBuilder(KeyBuilderBase): def update_for_dict(self, key_hash, key): # Order matters for the hash--insert in sorted order. - for dict_key in sorted(six.iterkeys(key), key=lambda obj: + for dict_key in sorted(key.keys(), key=lambda obj: type(obj).__name__ + str(obj)): self.rec(key_hash, (dict_key, key[dict_key])) @@ -121,7 +106,7 @@ class LoopyKeyBuilder(KeyBuilderBase): % type(key)) def update_for_type_auto(self, key_hash, key): - key_hash.update("auto".encode("utf8")) + key_hash.update(b"auto") def update_for_pymbolic_expression(self, key_hash, key): if key is None: @@ -130,7 +115,7 @@ class LoopyKeyBuilder(KeyBuilderBase): PersistentHashWalkMapper(key_hash)(key) -class PymbolicExpressionHashWrapper(object): +class PymbolicExpressionHashWrapper: def __init__(self, expression): self.expression = expression @@ -149,7 +134,7 @@ class PymbolicExpressionHashWrapper(object): # {{{ eq key builder -class LoopyEqKeyBuilder(object): +class LoopyEqKeyBuilder: """Unlike :class:`loopy.tools.LoopyKeyBuilder`, this builds keys for use in equality comparison, such that `key(a) == key(b)` if and only if `a == b`. The types of objects being compared should satisfy structural equality. @@ -229,11 +214,11 @@ def remove_common_indentation(code, require_leading_newline=True, test_line = None if ignore_lines_starting_with: - for l in lines: - strip_l = l.lstrip() + for line in lines: + strip_l = line.lstrip() if (strip_l and not strip_l.startswith(ignore_lines_starting_with)): - test_line = l + test_line = line break else: @@ -336,8 +321,8 @@ def cptr_from_numpy(obj): # https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172 -def empty_aligned(shape, dtype, order='C', n=64): - '''empty_aligned(shape, dtype='float64', order='C', n=None) +def empty_aligned(shape, dtype, order="C", n=64): + """empty_aligned(shape, dtype='float64', order="C", n=None) Function that returns an empty numpy array that is n-byte aligned, where ``n`` is determined by inspecting the CPU if it is not provided. @@ -345,7 +330,7 @@ def empty_aligned(shape, dtype, order='C', n=64): ``n`` is not provided then this function will inspect the CPU to determine alignment. The rest of the arguments are as per :func:`numpy.empty`. 
- ''' + """ itemsize = np.dtype(dtype).itemsize # Apparently there is an issue with numpy.prod wrapping around on 32-bits @@ -374,68 +359,9 @@ def empty_aligned(shape, dtype, order='C', n=64): # }}} -# {{{ compute SCCs with Tarjan's algorithm - -def compute_sccs(graph): - to_search = set(graph.keys()) - visit_order = {} - scc_root = {} - sccs = [] - - while to_search: - top = next(iter(to_search)) - call_stack = [(top, iter(graph[top]), None)] - visit_stack = [] - visiting = set() - - scc = [] - - while call_stack: - top, children, last_popped_child = call_stack.pop() - - if top not in visiting: - # Unvisited: mark as visited, initialize SCC root. - count = len(visit_order) - visit_stack.append(top) - visit_order[top] = count - scc_root[top] = count - visiting.add(top) - to_search.discard(top) - - # Returned from a recursion, update SCC. - if last_popped_child is not None: - scc_root[top] = min( - scc_root[top], - scc_root[last_popped_child]) - - for child in children: - if child not in visit_order: - # Recurse. - call_stack.append((top, children, child)) - call_stack.append((child, iter(graph[child]), None)) - break - if child in visiting: - scc_root[top] = min( - scc_root[top], - visit_order[child]) - else: - if scc_root[top] == visit_order[top]: - scc = [] - while visit_stack[-1] != top: - scc.append(visit_stack.pop()) - scc.append(visit_stack.pop()) - for item in scc: - visiting.remove(item) - sccs.append(scc) - - return sccs - -# }}} - - # {{{ pickled container value -class _PickledObject(object): +class _PickledObject: """A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDict` and :class:`LazilyUnpicklingList`). """ @@ -508,9 +434,9 @@ class LazilyUnpicklingDict(abc.MutableMapping): return iter(self._map) def __getstate__(self): - return {"_map": dict( - (key, _PickledObject(val)) - for key, val in six.iteritems(self._map))} + return {"_map": { + key: _PickledObject(val) + for key, val in self._map.items()}} # }}} @@ -610,11 +536,11 @@ class LazilyUnpicklingListWithEqAndPersistentHashing(LazilyUnpicklingList): # {{{ optional object -class _no_value(object): # noqa +class _no_value: # noqa pass -class Optional(object): +class Optional: """A wrapper for an optionally present object. .. attribute:: has_value @@ -681,7 +607,7 @@ class Optional(object): def unpickles_equally(obj): - from six.moves.cPickle import loads, dumps + from pickle import loads, dumps return loads(dumps(obj)) == obj @@ -692,123 +618,4 @@ def is_interned(s): def intern_frozenset_of_ids(fs): return frozenset(intern(s) for s in fs) - -def natorder(key): - # Return natural ordering for strings, as opposed to dictionary order. - # E.g. will result in - # 'abc1' < 'abc9' < 'abc10' - # rather than - # 'abc1' < 'abc10' < 'abc9' - # Based on - # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 - import re - return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] - - -def natsorted(seq, key=lambda x: x): - return sorted(seq, key=lambda y: natorder(key(y))) - - -def dump_as_python(kernel, filename=None): - """ - Generates a python code for generating *kernel* for sharing kernels. - - :arg kernel: An instance of :class:`loopy.LoopKernel` - :arg filename: An instance of :class:`str`. If *None*, then prints the - python file to *stdout*. 
- """ - - options = [] - - printed_insn_ids = set() - printed_insn_order = [] - - def insert_insn_into_order(insn): - if insn.id in printed_insn_ids: - return - printed_insn_ids.add(insn.id) - - for dep_id in natsorted(insn.depends_on): - insert_insn_into_order(kernel.id_to_insn[dep_id]) - - printed_insn_order.append(insn) - - for insn in kernel.instructions: - insert_insn_into_order(insn) - - for insn in printed_insn_order: - option = 'id=%s, ' % insn.id - if insn.depends_on: - option += ("dep="+":".join(insn.depends_on)+", ") - if insn.tags: - option += ("tags="+":".join(insn.tags)+", ") - if insn.within_inames: - option += ("inames="+":".join(insn.within_inames)+", ") - if isinstance(insn, lp.MultiAssignmentBase): - if insn.atomicity: - option += "atomic, " - elif isinstance(insn, lp.BarrierInstruction): - option += ("mem_kind=%s, " % insn.mem_kind) - options.append(option[:-2]) - - insn_x_options = zip(printed_insn_order, options) - - python_code = r'''<%! import loopy as lp %>import loopy as lp - import numpy as np - <%! tv_scope = {0: 'lp.AddressSpace.PRIVATE', 1: 'lp.AddressSpace.LOCAL', - 2: 'lp.AddressSpace.GLOBAL', lp.auto: 'lp.auto' } %> - knl = lp.make_kernel( - [ - % for dom in domains: - "${str(dom)}", - % endfor - ], - """ - % for insn, opts in insn_x_opts: - % if isinstance(insn, lp.Assignment): - ${insn.assignee} = ${insn.expression} {${opts}} - % elif isinstance(insn, lp.BarrierInstruction): - ... ${insn.synchronization_kind[0]}barrier{${opts}} - % elif isinstance(insn, lp.NoOpInstruction): - ... nop {${opts}} - % else: - **Not implemented for ${type(insn)}** - % endif - %endfor - """, [ - % for arg in args: - % if isinstance(arg, lp.ValueArg): - lp.ValueArg( - name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}), - % else: - lp.GlobalArg( - name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}, - shape=${arg.shape}, for_atomic=${arg.for_atomic}), - % endif - % endfor - % for tv in temp_vars: - lp.TemporaryVariable( - name='${tv.name}', dtype=np.${tv.dtype.numpy_dtype.name}, - shape=${tv.shape}, for_atomic=${tv.for_atomic}, - address_space=${tv_scope[tv.address_space]}, - read_only=${tv.read_only}, - % if tv.initializer is not None: - initializer=${"np."+str((tv.initializer).__repr__())}, - % endif - ), - % endfor - ], lang_version=${lp.VERSION})''' - - python_code = Template(python_code).render(insn_x_opts=insn_x_options, - domains=kernel.domains, args=kernel.args, - temp_vars=[k for k in kernel.temporary_variables.values()]) - - python_code = re.sub("\\n ", "\n", python_code) - if filename: - with open(filename, 'w') as f: - f.write(python_code) - else: - print(python_code) - - # vim: foldmethod=marker diff --git a/loopy/transform/__init__.py b/loopy/transform/__init__.py index f42fd3c8d2943bb37b75e9ef0003b88985950926..625781167db6aa502153cdcebd225d79e95c46b6 100644 --- a/loopy/transform/__init__.py +++ b/loopy/transform/__init__.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index f4a184f632d251bed7ec7d6ace718b3851c5c0d8..1e03ade94710b25cd56eecc7079afdadf567a82c 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2017 Kaushik Kulkarni" __license__ = """ @@ -39,9 +37,8 @@ __doc__ = """ # {{{ add_barrier @iterate_over_kernels_if_given_program -def 
add_barrier(knl, insn_before="", insn_after="", - id_based_on=None, tags=None, synchronization_kind="global", - mem_kind=None): +def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, + tags=None, synchronization_kind="global", mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel which has a barrier inserted into it. It takes input of 2 instructions and then adds a barrier in between those 2 instructions. The expressions can @@ -59,19 +56,19 @@ def add_barrier(knl, insn_before="", insn_after="", for "global" bariers. If not supplied, defaults to *synchronization_kind* """ - assert isinstance(knl, LoopKernel) + assert isinstance(kernel, LoopKernel) if mem_kind is None: mem_kind = synchronization_kind if id_based_on is None: - id = knl.make_unique_instruction_id( + id = kernel.make_unique_instruction_id( based_on=synchronization_kind[0]+"_barrier") else: - id = knl.make_unique_instruction_id(based_on=id_based_on) + id = kernel.make_unique_instruction_id(based_on=id_based_on) match = parse_match(insn_before) - insn_before_list = [insn.id for insn in knl.instructions if match(knl, + insn_before_list = [insn.id for insn in kernel.instructions if match(kernel, insn)] barrier_to_add = BarrierInstruction(depends_on=frozenset(insn_before_list), @@ -81,12 +78,12 @@ def add_barrier(knl, insn_before="", insn_after="", synchronization_kind=synchronization_kind, mem_kind=mem_kind) - new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) - new_knl = add_dependency(new_knl, + new_kernel = kernel.copy(instructions=kernel.instructions + [barrier_to_add]) + new_kernel = add_dependency(kernel=new_kernel, insn_match=insn_after, depends_on="id:"+id) - return new_knl + return new_kernel # }}} diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 3df86e7ae04073e654f91b30c584719c165269d0..8376688198c3cff232d9f9006883d1b236efe367 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,8 +21,6 @@ THE SOFTWARE. 
""" -import six - from loopy.diagnostic import LoopyError from loopy.program import iterate_over_kernels_if_given_program @@ -42,10 +38,10 @@ def fold_constants(kernel): insn.with_transformed_expressions(cfm) for insn in kernel.instructions] - new_substs = dict( - (sub.name, - sub.copy(expression=cfm(sub.expression))) - for sub in six.itervalues(kernel.substitutions)) + new_substs = { + sub.name: + sub.copy(expression=cfm(sub.expression)) + for sub in kernel.substitutions.values()} return kernel.copy( instructions=new_insns, @@ -80,9 +76,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): from loopy.kernel.array import ArrayBase if isinstance(var_descr, ArrayBase): if var_descr.dim_names is not None: - name_to_index = dict( - (name, idx) - for idx, name in enumerate(var_descr.dim_names)) + name_to_index = { + name: idx + for idx, name in enumerate(var_descr.dim_names)} else: name_to_index = {} @@ -146,8 +142,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): def iterate_as(cls, expr): if isinstance(expr, cls): - for ch in expr.children: - yield ch + yield from expr.children else: yield expr @@ -222,9 +217,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): product_parts = set(iterate_as(Product, term)) - my_common_factors = set( + my_common_factors = { cf for cf in my_common_factors - if unif_subst_map(cf) in product_parts) + if unif_subst_map(cf) in product_parts} common_factors[cf_index] = (index_key, my_common_factors) @@ -269,9 +264,9 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): unif_subst_map = SubstitutionMapper( make_subst_func(unif_result.lmap)) - mapped_my_common_factors = set( + mapped_my_common_factors = { unif_subst_map(cf) - for cf in my_common_factors) + for cf in my_common_factors} new_sum_terms = [] diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index b62e13d6b268a9b84e209a5c8958dc949114eecf..4ef5fac77c3af646352e00d595e028223dd9a316 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from six.moves import range, zip import islpy as isl from islpy import dim_type @@ -195,7 +192,7 @@ def compute_bounds(kernel, domain, stor2sweep, # {{{ array-to-buffer map -class ArrayToBufferMap(object): +class ArrayToBufferMap: def __init__(self, kernel, domain, sweep_inames, access_descriptors, storage_axis_count): self.kernel = kernel @@ -218,8 +215,8 @@ class ArrayToBufferMap(object): self.primed_sweep_inames) self.prime_sweep_inames = SubstitutionMapper(make_subst_func( - dict((sin, var(psin)) - for sin, psin in zip(sweep_inames, self.primed_sweep_inames)))) + {sin: var(psin) + for sin, psin in zip(sweep_inames, self.primed_sweep_inames)})) # # }}} @@ -403,7 +400,7 @@ class ArrayToBufferMap(object): aligned_g_s2s_parm_dom) -class NoOpArrayToBufferMap(object): +class NoOpArrayToBufferMap: non1_storage_axis_names = () storage_base_indices = () non1_storage_shape = () diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index d5a97b773e7447833c96405920e1efad1b382baa..5da142e3d400edf151ee755990d1fa4845aa147e 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,10 +21,7 @@ THE SOFTWARE. """ -import six - -from loopy.symbolic import (RuleAwareIdentityMapper, - SubstitutionRuleMappingContext, pw_aff_to_expr) +from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl @@ -37,7 +32,6 @@ __doc__ = """ .. currentmodule:: loopy .. autofunction:: to_batched -.. autofunction:: save_temporaries_in_loop """ @@ -59,15 +53,13 @@ def temp_needs_batching_if_not_sequential(tv, batch_varying_args): class _BatchVariableChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, batch_varying_args, - batch_iname_expr, sequential, batch_varying_temps=None, within=None): - super(_BatchVariableChanger, self).__init__(rule_mapping_context) + batch_iname_expr, sequential): + super().__init__(rule_mapping_context) self.kernel = kernel self.batch_varying_args = batch_varying_args self.batch_iname_expr = batch_iname_expr self.sequential = sequential - self.batch_varying_temps = batch_varying_temps - self.within = within def needs_batch_subscript(self, name): tv = self.kernel.temporary_variables.get(name) @@ -77,20 +69,15 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): if not self.sequential: if tv is None: return False - if self.batch_varying_temps: - return tv.name in self.batch_varying_temps - else: - if not temp_needs_batching_if_not_sequential(tv, - self.batch_varying_args): - return False + if not temp_needs_batching_if_not_sequential(tv, + self.batch_varying_args): + return False return True def map_subscript(self, expr, expn_state): - if not self.needs_batch_subscript(expr.aggregate.name) or not ( - self.within(expn_state.kernel, expn_state.instruction, - expn_state.stack)): - return super(_BatchVariableChanger, self).map_subscript(expr, expn_state) + if not self.needs_batch_subscript(expr.aggregate.name): + return super().map_subscript(expr, expn_state) idx = self.rec(expr.index, expn_state) if not isinstance(idx, tuple): @@ -99,10 +86,8 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): return type(expr)(expr.aggregate, (self.batch_iname_expr,) + idx) def map_variable(self, expr, expn_state): - if not self.needs_batch_subscript(expr.name) or not ( - self.within(expn_state.kernel, 
expn_state.instruction, - expn_state.stack)): - return super(_BatchVariableChanger, self).map_variable(expr, expn_state) + if not self.needs_batch_subscript(expr.name): + return super().map_variable(expr, expn_state) return expr[self.batch_iname_expr] @@ -117,8 +102,8 @@ def _add_unique_dim_name(name, dim_names): @iterate_over_kernels_if_given_program -def to_batched(knl, nbatches, batch_varying_args, - batch_iname_prefix="ibatch", sequential=False, within=None): +def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch", + sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. @@ -138,29 +123,29 @@ def to_batched(knl, nbatches, batch_varying_args, from pymbolic import var - vng = knl.get_var_name_generator() + vng = kernel.get_var_name_generator() batch_iname = vng(batch_iname_prefix) batch_iname_expr = var(batch_iname) new_args = [] - batch_dom_str = "{[%(iname)s]: 0 <= %(iname)s < %(nbatches)s}" % { - "iname": batch_iname, - "nbatches": nbatches, - } + batch_dom_str = "{{[{iname}]: 0 <= {iname} < {nbatches}}}".format( + iname=batch_iname, + nbatches=nbatches, + ) if not isinstance(nbatches, int): batch_dom_str = "[%s] -> " % nbatches + batch_dom_str - new_args.append(ValueArg(nbatches, dtype=knl.index_dtype)) + new_args.append(ValueArg(nbatches, dtype=kernel.index_dtype)) nbatches_expr = var(nbatches) else: nbatches_expr = nbatches batch_domain = isl.BasicSet(batch_dom_str) - new_domains = [batch_domain] + knl.domains + new_domains = [batch_domain] + kernel.domains - for arg in knl.args: + for arg in kernel.args: if arg.name in batch_varying_args: if isinstance(arg, ValueArg): arg = ArrayArg(arg.name, arg.dtype, shape=(nbatches_expr,), @@ -173,14 +158,14 @@ def to_batched(knl, nbatches, batch_varying_args, new_args.append(arg) - knl = knl.copy( + kernel = kernel.copy( domains=new_domains, args=new_args) if not sequential: new_temps = {} - for temp in six.itervalues(knl.temporary_variables): + for temp in kernel.temporary_variables.values(): if temp_needs_batching_if_not_sequential(temp, batch_varying_args): new_temps[temp.name] = temp.copy( shape=(nbatches_expr,) + temp.shape, @@ -189,90 +174,28 @@ def to_batched(knl, nbatches, batch_varying_args, else: new_temps[temp.name] = temp - knl = knl.copy(temporary_variables=new_temps) + kernel = kernel.copy(temporary_variables=new_temps) else: import loopy as lp from loopy.kernel.data import ForceSequentialTag - knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())]) - - from loopy.match import parse_stack_match, parse_match + kernel = lp.tag_inames(kernel, [(batch_iname, ForceSequentialTag())]) rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, vng) + kernel.substitutions, vng) bvc = _BatchVariableChanger(rule_mapping_context, - knl, batch_varying_args, batch_iname_expr, - sequential=sequential, within=parse_stack_match(within)) + kernel, batch_varying_args, batch_iname_expr, + sequential=sequential) kernel = rule_mapping_context.finish_kernel( - bvc.map_kernel(knl)) + bvc.map_kernel(kernel)) batch_iname_set = frozenset([batch_iname]) - within = parse_match(within) kernel = kernel.copy( instructions=[ insn.copy(within_inames=insn.within_inames | batch_iname_set) - if within(kernel, insn) else insn for insn in kernel.instructions]) + for insn in kernel.instructions]) return kernel # }}} - -@iterate_over_kernels_if_given_program -def save_temporaries_in_loop(knl, iname, temps_to_save, within=None): - """ - 
Returns a kernel with the temporary variables in *temps_to_save* batched - within the iname *iname*. - - :arg iname: An instance of :class:`str1 for the loop across which the - values of the temporaries are to be saved. - - :arg temps_to_save: An iterable containing the temporaries that are to be - saved for each loop iteration defined by *iname*. - - :arg within: If not None, limit the action of the transformation to - matching contexts. See :func:`loopy.match.parse_stack_match` - for syntax. - """ - from loopy.match import parse_match, parse_stack_match - from pymbolic import var - from loopy.isl_helpers import static_max_of_pw_aff - - batch_iname_expr = var(iname) - - bounds = knl.get_iname_bounds(iname, constants_only=False) - nbatches_expr = pw_aff_to_expr(static_max_of_pw_aff(bounds.size, - constants_only=False)) - - new_temps = {} - - for temp in six.itervalues(knl.temporary_variables): - if temp.name in temps_to_save: - new_temps[temp.name] = temp.copy( - shape=(nbatches_expr,) + temp.shape, - dim_tags=("c",) * (len(temp.shape) + 1), - dim_names=_add_unique_dim_name("itemp_save", temp.dim_names)) - else: - new_temps[temp.name] = temp - - knl = knl.copy(temporary_variables=new_temps) - - rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, knl.get_var_name_generator) - bvc = _BatchVariableChanger(rule_mapping_context, - knl, [], batch_iname_expr, - sequential=False, batch_varying_temps=temps_to_save, - within=parse_stack_match(within)) - kernel = rule_mapping_context.finish_kernel( - bvc.map_kernel(knl)) - - within = parse_match(within) - - batch_iname_set = frozenset([iname]) - kernel = kernel.copy( - instructions=[ - insn.copy(within_inames=insn.within_inames | batch_iname_set) - if within(kernel, insn) else insn for insn in kernel.instructions]) - - return kernel - # vim: foldmethod=marker diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index a1c90d791a9d4097398610badc421aa4600e2097..7f1ca059acf95f39dfb050c1889149f7a2ed03de 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -1,6 +1,3 @@ -from __future__ import division, absolute_import -from six.moves import range - __copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" __license__ = """ @@ -48,7 +45,7 @@ logger = logging.getLogger(__name__) class ArrayAccessReplacer(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, var_name, within, array_base_map, buf_var): - super(ArrayAccessReplacer, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.within = within @@ -68,7 +65,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): result = self.map_array_access((), expn_state) if result is None: - return super(ArrayAccessReplacer, self).map_variable(expr, expn_state) + return super().map_variable(expr, expn_state) else: self.modified_insn_ids.add(expn_state.insn_id) return result @@ -82,7 +79,7 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): result = self.map_array_access(expr.index_tuple, expn_state) if result is None: - return super(ArrayAccessReplacer, self).map_subscript(expr, expn_state) + return super().map_subscript(expr, expn_state) else: self.modified_insn_ids.add(expn_state.insn_id) return result @@ -309,8 +306,8 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name, if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None: dim_name = var_descr.dim_names[i] - init_iname = var_name_gen("%s_init_%s" % (var_name, dim_name)) - store_iname = var_name_gen("%s_store_%s" % 
(var_name, dim_name)) + init_iname = var_name_gen(f"{var_name}_init_{dim_name}") + store_iname = var_name_gen(f"{var_name}_store_{dim_name}") new_iname_to_tag[init_iname] = default_tag new_iname_to_tag[store_iname] = default_tag diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 1bbdb12010818d92b989f898ab874b10c5c2a31c..461a4cb5fc4236db4b2dbeea2c8180ce77f308a3 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six - import islpy as isl from pymbolic.primitives import CallWithKwargs @@ -63,10 +59,10 @@ def _resolve_callables_from_function_lookup(program, """ callables_table = program.callables_table - callable_knls = dict( - (func_id, in_knl_callable) for func_id, in_knl_callable in + callable_knls = { + func_id: in_knl_callable for func_id, in_knl_callable in callables_table.items() if isinstance(in_knl_callable, - CallableKernel)) + CallableKernel)} edited_callable_knls = {} for func_id, in_knl_callable in callable_knls.items(): @@ -143,7 +139,7 @@ class _RegisterCalleeKernel(ImmutableRecord): :func:`loopy.transform.register_callable_kernel` picklable. As python cannot pickle lexical closures. """ - fields = set(['callable_kernel']) + fields = {"callable_kernel"} def __init__(self, callable_kernel): self.callable_kernel = callable_kernel @@ -255,8 +251,7 @@ def register_callable_kernel(program, callee_kernel): # {{{ sanity checks assert isinstance(program, Program) - assert isinstance(callee_kernel, LoopKernel), ('{0} !=' - '{1}'.format(type(callee_kernel), LoopKernel)) + assert isinstance(callee_kernel, LoopKernel) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): @@ -328,7 +323,7 @@ class KernelInliner(SubstitutionMapper): """ def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) + super().__init__(subst_func) self.caller = caller self.arg_map = arg_map self.arg_dict = arg_dict @@ -352,7 +347,7 @@ class KernelInliner(SubstitutionMapper): from numbers import Integral if not all(isinstance(d, Integral) for d in callee_arg.shape): raise LoopyError( - "Argument: {0} in callee kernel does not have " + "Argument: {} in callee kernel does not have " "constant shape.".format(callee_arg)) flatten_index = 0 @@ -378,7 +373,7 @@ class KernelInliner(SubstitutionMapper): return aggregate.index(tuple(new_indices)) else: - return super(KernelInliner, self).map_subscript(expr) + return super().map_subscript(expr) # }}} @@ -427,7 +422,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): temp_map = {} new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee_knl.temporary_variables): + for name, temp in callee_knl.temporary_variables.items(): new_name = vng(callee_label+name) temp_map[name] = new_name new_temps[new_name] = temp.copy(name=new_name) @@ -469,11 +464,11 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): import pymbolic.primitives as p from pymbolic.mapper.substitutor import make_subst_func - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - for k, v in six.iteritems(arg_map): + 
var_map = {p.Variable(k): p.Variable(v) + for k, v in iname_map.items()} + var_map.update({p.Variable(k): p.Variable(v) + for k, v in temp_map.items()}) + for k, v in arg_map.items(): if isinstance(v, SubArrayRef): var_map[p.Variable(k)] = v.subscript.aggregate else: @@ -490,10 +485,10 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): dep_map = callee_knl.recursive_insn_dep_map() # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + heads = {insn for insn, deps in dep_map.items() if not deps} # leaves have nothing that depends on them tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): + for insn, deps in dep_map.items(): tails = tails - deps # }}} @@ -523,7 +518,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( instruction.depends_on) if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) + depends_on = depends_on | {noop_start.id} new_atomicity = tuple( type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) @@ -663,7 +658,7 @@ class DimChanger(IdentityMapper): def map_subscript(self, expr): if expr.aggregate.name not in self.callee_arg_dict: - return super(DimChanger, self).map_subscript(expr) + return super().map_subscript(expr) callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in zip(callee_arg_dim_tags, expr.index_tuple)) @@ -710,7 +705,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( get_kw_pos_association) _, pos_to_kw = get_kw_pos_association(callee_knl) arg_id_to_shape = {} - for arg_id, arg in six.iteritems(insn.arg_id_to_val()): + for arg_id, arg in insn.arg_id_to_val().items(): arg_id = pos_to_kw[arg_id] arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 2c9499d9d92d73eeea1ce5344ca8475e60dedbd0..0ed1159446f8f4bd26b480d3e08bd5d7f1c008b7 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six # noqa - from loopy.diagnostic import LoopyError from islpy import dim_type @@ -290,15 +286,15 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, if temporary_name is None: temporary_name = var_name_gen("%s_fetch" % c_name) - arg = kernel.arg_dict[var_name] + var_descr = kernel.get_var_descriptor(var_name) # {{{ make parameter names and unification template parameters = [] - for i in range(arg.num_user_axes()): + for i in range(var_descr.num_user_axes()): based_on = "%s_dim_%d" % (c_name, i) - if arg.dim_names is not None: - based_on = "%s_dim_%s" % (c_name, arg.dim_names[i]) + if var_descr.dim_names is not None: + based_on = "{}_dim_{}".format(c_name, var_descr.dim_names[i]) if dim_arg_names is not None and i < len(dim_arg_names): based_on = dim_arg_names[i] @@ -327,7 +323,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, kernel, subst_use, sweep_inames, inames_to_be_removed = \ _process_footprint_subscripts( kernel, rule_name, sweep_inames, - footprint_subscripts, arg) + footprint_subscripts, var_descr) # Our _not_provided is actually a different object from the one in the # precompute module, but precompute acutally uses that to adjust its @@ -336,7 +332,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, from loopy.transform.precompute import precompute_for_single_kernel new_kernel = precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames, precompute_inames=dim_arg_names, - default_tag=default_tag, dtype=arg.dtype, + default_tag=default_tag, dtype=var_descr.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, temporary_address_space=temporary_address_space, @@ -398,9 +394,9 @@ def add_prefetch(program, *args, **kwargs): # {{{ change variable kinds -def change_arg_to_image(knl, name): +def change_arg_to_image(kernel, name): new_args = [] - for arg in knl.args: + for arg in kernel.args: if arg.name == name: assert arg.offset == 0 assert arg.shape is not None @@ -408,7 +404,7 @@ def change_arg_to_image(knl, name): else: new_args.append(arg) - return knl.copy(args=new_args) + return kernel.copy(args=new_args) # }}} @@ -416,11 +412,11 @@ def change_arg_to_image(knl, name): # {{{ tag array axes @iterate_over_kernels_if_given_program -def tag_array_axes(knl, ary_names, dim_tags): +def tag_array_axes(kernel, ary_names, dim_tags): """ :arg dim_tags: a tuple of :class:`loopy.kernel.array.ArrayDimImplementationTag` or a string that - parses to one. See :func:`loopy.kernel.array.parse_dim_tags` for a + parses to one. See :func:`loopy.kernel.array.parse_array_dim_tags` for a description of the allowed string format. For example, *dim_tags* could be ``"N2,N0,N1"`` to determine @@ -429,7 +425,7 @@ def tag_array_axes(knl, ary_names, dim_tags): .. versionchanged:: 2016.2 - This function was called :func:`tag_data_axes` before version 2016.2. + This function was called ``tag_data_axes`` before version 2016.2. 
""" from loopy.kernel.tools import ArrayChanger @@ -438,7 +434,7 @@ def tag_array_axes(knl, ary_names, dim_tags): ary_names = [ary_name.strip() for ary_name in ary_names.split(",")] for ary_name in ary_names: - achng = ArrayChanger(knl, ary_name) + achng = ArrayChanger(kernel, ary_name) ary = achng.get() from loopy.kernel.array import parse_array_dim_tags @@ -449,9 +445,9 @@ def tag_array_axes(knl, ary_names, dim_tags): ary = ary.copy(dim_tags=tuple(new_dim_tags)) - knl = achng.with_changed_array(ary) + kernel = achng.with_changed_array(ary) - return knl + return kernel tag_data_axes = ( @@ -467,7 +463,7 @@ def set_array_axis_names(kernel, ary_names, dim_names): """ .. versionchanged:: 2016.2 - This function was called :func:`set_array_dim_names` before version 2016.2. + This function was called ``set_array_dim_names`` before version 2016.2. """ from loopy.kernel.tools import ArrayChanger if isinstance(ary_names, str): @@ -496,14 +492,14 @@ set_array_dim_names = (MovedFunctionDeprecationWrapper( # {{{ remove_unused_arguments @iterate_over_kernels_if_given_program -def remove_unused_arguments(knl): +def remove_unused_arguments(kernel): new_args = [] import loopy as lp - exp_knl = lp.expand_subst(knl) + exp_kernel = lp.expand_subst(kernel) - refd_vars = set(knl.all_params()) - for insn in exp_knl.instructions: + refd_vars = set(kernel.all_params()) + for insn in exp_kernel.instructions: refd_vars.update(insn.dependency_names()) from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag @@ -515,7 +511,7 @@ def remove_unused_arguments(knl): return set() return get_dependencies(expr) - for ary in chain(knl.args, six.itervalues(knl.temporary_variables)): + for ary in chain(kernel.args, kernel.temporary_variables.values()): if isinstance(ary, ArrayBase): refd_vars.update( tolerant_get_deps(ary.shape) @@ -526,11 +522,11 @@ def remove_unused_arguments(knl): refd_vars.update( tolerant_get_deps(dim_tag.stride)) - for arg in knl.args: + for arg in kernel.args: if arg.name in refd_vars: new_args.append(arg) - return knl.copy(args=new_args) + return kernel.copy(args=new_args) # }}} @@ -538,7 +534,7 @@ def remove_unused_arguments(knl): # {{{ alias_temporaries @iterate_over_kernels_if_given_program -def alias_temporaries(knl, names, base_name_prefix=None, +def alias_temporaries(kernel, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of storage. @@ -558,20 +554,20 @@ def alias_temporaries(knl, names, base_name_prefix=None, ``synchronize_for_exclusive_use=True`` was the previous default behavior. 
""" - gng = knl.get_group_name_generator() + gng = kernel.get_group_name_generator() group_names = [gng("tmpgrp_"+name) for name in names] if base_name_prefix is None: base_name_prefix = "temp_storage" - vng = knl.get_var_name_generator() + vng = kernel.get_var_name_generator() base_name = vng(base_name_prefix) names_set = set(names) if synchronize_for_exclusive_use: new_insns = [] - for insn in knl.instructions: + for insn in kernel.instructions: temp_deps = insn.dependency_names() & names_set if not temp_deps: @@ -598,10 +594,10 @@ def alias_temporaries(knl, names, base_name_prefix=None, conflicts_with_groups=( insn.conflicts_with_groups | other_group_names))) else: - new_insns = knl.instructions + new_insns = kernel.instructions new_temporary_variables = {} - for tv in six.itervalues(knl.temporary_variables): + for tv in kernel.temporary_variables.values(): if tv.name in names_set: if tv.base_storage is not None: raise LoopyError("temporary variable '{tv}' already has " @@ -613,7 +609,7 @@ def alias_temporaries(knl, names, base_name_prefix=None, else: new_temporary_variables[tv.name] = tv - return knl.copy( + return kernel.copy( instructions=new_insns, temporary_variables=new_temporary_variables) @@ -686,7 +682,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): kernel.substitutions, var_name_gen) smap = RuleAwareSubstitutionMapper(rule_mapping_context, make_subst_func(subst_dict), - within=lambda knl, insn, stack: True) + within=lambda kernel, insn, stack: True) kernel = smap.map_kernel(kernel) @@ -710,7 +706,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): :arg temp_var_names: a container with membership checking, or a comma-separated string of variables for which the scope is to be set. - :arg scope: One of the values from :class:`AddressSpace`, or one + :arg scope: One of the values from :class:`loopy.AddressSpace`, or one of the strings ``"private"``, ``"local"``, or ``"global"``. 
""" @@ -747,15 +743,16 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule @iterate_over_kernels_if_given_program -def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None): +def reduction_arg_to_subst_rule( + kernel, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] inames_set = frozenset(inames) - substs = knl.substitutions.copy() + substs = kernel.substitutions.copy() - var_name_gen = knl.get_var_name_generator() + var_name_gen = kernel.get_var_name_generator() def map_reduction(expr, rec, nresults=1): if frozenset(expr.inames) != inames_set: @@ -796,13 +793,13 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No from loopy.kernel.data import MultiAssignmentBase new_insns = [] - for insn in knl.instructions: + for insn in kernel.instructions: if not isinstance(insn, MultiAssignmentBase): new_insns.append(insn) else: new_insns.append(insn.copy(expression=cb_mapper(insn.expression))) - return knl.copy( + return kernel.copy( instructions=new_insns, substitutions=substs) diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 33bd519b2d84bf9d64a65214897e9084375dd6a4..5a42973526a6c2c174bc67b76db1a46ebb7f181a 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -167,7 +165,7 @@ class LoopyDiffMapper(DifferentiationMapper, RuleAwareIdentityMapper): # {{{ diff context -class DifferentiationContext(object): +class DifferentiationContext: def __init__(self, kernel, var_name_gen, by_name, diff_iname_prefix, additional_shape): self.kernel = kernel @@ -369,7 +367,7 @@ class DifferentiationContext(object): # {{{ entrypoint -def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", +def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i", batch_axes_in_by=frozenset(), copy_outputs=set()): """ @@ -380,25 +378,25 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. """ - assert isinstance(knl, LoopKernel) + assert isinstance(kernel, LoopKernel) from loopy.kernel.creation import apply_single_writer_depencency_heuristic - knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=True) + kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True) if isinstance(diff_outputs, str): diff_outputs = [ dout.strip() for dout in diff_outputs.split(",") if dout.strip()] - by_arg = knl.arg_dict[by] + by_arg = kernel.arg_dict[by] additional_shape = by_arg.shape - var_name_gen = knl.get_var_name_generator() + var_name_gen = kernel.get_var_name_generator() # {{{ differentiate instructions diff_context = DifferentiationContext( - knl, var_name_gen, by, diff_iname_prefix=diff_iname_prefix, + kernel, var_name_gen, by, diff_iname_prefix=diff_iname_prefix, additional_shape=additional_shape) result = {} diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 921117f9ed6f5a0c4ca54d04e15e94f25237f3cb..9d4c083889d9ca425da1654edfaf9e848fa6210b 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,8 +21,6 @@ THE SOFTWARE. 
""" -import six - import islpy as isl from islpy import dim_type @@ -55,7 +51,7 @@ def _rename_temporaries(kernel, suffix, all_identifiers): vng = kernel.get_var_name_generator() new_temporaries = {} - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): if tv.name in all_identifiers: new_tv_name = vng(tv.name+suffix) else: @@ -107,7 +103,7 @@ def _ordered_merge_lists(list_a, list_b): def _merge_dicts(item_name, dict_a, dict_b): result = dict_a.copy() - for k, v in six.iteritems(dict_b): + for k, v in dict_b.items(): if k in result: if v != result[k]: raise LoopyError("inconsistent %ss for key '%s' in merge: %s and %s" @@ -131,16 +127,16 @@ def _merge_values(item_name, val_a, val_b): # {{{ two-kernel fusion -def _fuse_two_kernels(knla, knlb): +def _fuse_two_kernels(kernela, kernelb): from loopy.kernel import KernelState - if knla.state != KernelState.INITIAL or knlb.state != KernelState.INITIAL: + if kernela.state != KernelState.INITIAL or kernelb.state != KernelState.INITIAL: raise LoopyError("can only fuse kernels in INITIAL state") # {{{ fuse domains - new_domains = knla.domains[:] + new_domains = kernela.domains[:] - for dom_b in knlb.domains: + for dom_b in kernelb.domains: i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains) if i_fuse is None: new_domains.append(dom_b) @@ -166,14 +162,14 @@ def _fuse_two_kernels(knla, knlb): # }}} - vng = knla.get_var_name_generator() + vng = kernela.get_var_name_generator() b_var_renames = {} # {{{ fuse args - new_args = knla.args[:] - for b_arg in knlb.args: - if b_arg.name not in knla.arg_dict: + new_args = kernela.args[:] + for b_arg in kernelb.args: + if b_arg.name not in kernela.arg_dict: new_arg_name = vng(b_arg.name) if new_arg_name != b_arg.name: @@ -181,21 +177,21 @@ def _fuse_two_kernels(knla, knlb): new_args.append(b_arg.copy(name=new_arg_name)) else: - if b_arg != knla.arg_dict[b_arg.name]: + if b_arg != kernela.arg_dict[b_arg.name]: raise LoopyError( "argument '{arg_name}' has inconsistent definition between " "the two kernels being merged ({arg_a} <-> {arg_b})" .format( arg_name=b_arg.name, - arg_a=str(knla.arg_dict[b_arg.name]), + arg_a=str(kernela.arg_dict[b_arg.name]), arg_b=str(b_arg))) # }}} # {{{ fuse temporaries - new_temporaries = knla.temporary_variables.copy() - for b_name, b_tv in six.iteritems(knlb.temporary_variables): + new_temporaries = kernela.temporary_variables.copy() + for b_name, b_tv in kernelb.temporary_variables.items(): assert b_name == b_tv.name new_tv_name = vng(b_name) @@ -208,18 +204,18 @@ def _fuse_two_kernels(knla, knlb): # }}} - knlb = _apply_renames_in_exprs(knlb, b_var_renames) + kernelb = _apply_renames_in_exprs(kernelb, b_var_renames) from pymbolic.imperative.transform import \ fuse_statement_streams_with_unique_ids new_instructions, old_b_id_to_new_b_id = \ fuse_statement_streams_with_unique_ids( - knla.instructions, knlb.instructions) + kernela.instructions, kernelb.instructions) # {{{ fuse assumptions - assump_a = knla.assumptions - assump_b = knlb.assumptions + assump_a = kernela.assumptions + assump_b = kernelb.assumptions assump_a, assump_b = isl.align_two(assump_a, assump_b) shared_param_names = list( @@ -242,49 +238,49 @@ def _fuse_two_kernels(knla, knlb): domains=new_domains, instructions=new_instructions, args=new_args, - name="%s_and_%s" % (knla.name, knlb.name), - preambles=_ordered_merge_lists(knla.preambles, knlb.preambles), + name=f"{kernela.name}_and_{kernelb.name}", + preambles=_ordered_merge_lists(kernela.preambles, 
kernelb.preambles), preamble_generators=_ordered_merge_lists( - knla.preamble_generators, knlb.preamble_generators), + kernela.preamble_generators, kernelb.preamble_generators), assumptions=new_assumptions, local_sizes=_merge_dicts( - "local size", knla.local_sizes, knlb.local_sizes), + "local size", kernela.local_sizes, kernelb.local_sizes), temporary_variables=new_temporaries, iname_to_tags=_merge_dicts( "iname-to-tag mapping", - knla.iname_to_tags, - knlb.iname_to_tags), + kernela.iname_to_tags, + kernelb.iname_to_tags), substitutions=_merge_dicts( "substitution", - knla.substitutions, - knlb.substitutions), + kernela.substitutions, + kernelb.substitutions), function_manglers=_ordered_merge_lists( - knla.function_manglers, - knlb.function_manglers), + kernela.function_manglers, + kernelb.function_manglers), symbol_manglers=_ordered_merge_lists( - knla.symbol_manglers, - knlb.symbol_manglers), + kernela.symbol_manglers, + kernelb.symbol_manglers), iname_slab_increments=_merge_dicts( "iname slab increment", - knla.iname_slab_increments, - knlb.iname_slab_increments), - loop_priority=knla.loop_priority.union(knlb.loop_priority), + kernela.iname_slab_increments, + kernelb.iname_slab_increments), + loop_priority=kernela.loop_priority.union(kernelb.loop_priority), silenced_warnings=_ordered_merge_lists( - knla.silenced_warnings, - knlb.silenced_warnings), + kernela.silenced_warnings, + kernelb.silenced_warnings), applied_iname_rewrites=_ordered_merge_lists( - knla.applied_iname_rewrites, - knlb.applied_iname_rewrites), + kernela.applied_iname_rewrites, + kernelb.applied_iname_rewrites), index_dtype=_merge_values( "index dtype", - knla.index_dtype, - knlb.index_dtype), + kernela.index_dtype, + kernelb.index_dtype), target=_merge_values( "target", - knla.target, - knlb.target), - options=knla.options), old_b_id_to_new_b_id + kernela.target, + kernelb.target), + options=kernela.options), old_b_id_to_new_b_id # }}} @@ -375,19 +371,19 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): kernel_insn_ids = [] result = None - for knlb in kernels: + for kernelb in kernels: if result is None: - result = knlb + result = kernelb kernel_insn_ids.append([ - insn.id for insn in knlb.instructions]) + insn.id for insn in kernelb.instructions]) else: result, old_b_id_to_new_b_id = _fuse_two_kernels( - knla=result, - knlb=knlb) + kernela=result, + kernelb=kernelb) kernel_insn_ids.append([ old_b_id_to_new_b_id[insn.id] - for insn in knlb.instructions]) + for insn in kernelb.instructions]) # {{{ realize data_flow dependencies diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 584aca6a4f6914e61ff50b593c500e342bf495fd..473dbbca7a69816836b13d5496562656e1f03a72 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,9 +21,6 @@ THE SOFTWARE. """ -import six -from six.moves import zip - import islpy as isl from islpy import dim_type @@ -76,6 +71,8 @@ __doc__ = """ .. autofunction:: add_inames_to_insn +.. 
autofunction:: add_inames_for_unused_hw_axes + """ @@ -129,7 +126,7 @@ def prioritize_loops(kernel, loop_priority): class _InameSplitter(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, within, split_iname, outer_iname, inner_iname, replacement_index): - super(_InameSplitter, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.within = within @@ -154,7 +151,7 @@ class _InameSplitter(RuleAwareIdentityMapper): self.rec(expr.expr, expn_state), expr.allow_simultaneous) else: - return super(_InameSplitter, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def map_variable(self, expr, expn_state): if (expr.name == self.split_iname @@ -164,7 +161,7 @@ class _InameSplitter(RuleAwareIdentityMapper): expn_state.instruction)): return self.replacement_index else: - return super(_InameSplitter, self).map_variable(expr, expn_state) + return super().map_variable(expr, expn_state) def _split_iname_backend(kernel, split_iname, @@ -474,7 +471,7 @@ def chunk_iname(kernel, split_iname, num_chunks, class _InameJoiner(RuleAwareSubstitutionMapper): def __init__(self, rule_mapping_context, within, subst_func, joined_inames, new_iname): - super(_InameJoiner, self).__init__(rule_mapping_context, + super().__init__(rule_mapping_context, subst_func, within) self.joined_inames = set(joined_inames) @@ -505,7 +502,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): self.rec(expr.expr, expn_state), expr.allow_simultaneous) else: - return super(_InameJoiner, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) @iterate_over_kernels_if_given_program @@ -655,7 +652,7 @@ def untag_inames(kernel, iname_to_untag, tag_type): knl_iname_to_tags = kernel.iname_to_tags.copy() old_tags = knl_iname_to_tags.get(iname_to_untag, frozenset()) - old_tags = set(tag for tag in old_tags if not isinstance(tag, tag_type)) + old_tags = {tag for tag in old_tags if not isinstance(tag, tag_type)} if old_tags: knl_iname_to_tags[iname_to_untag] = old_tags @@ -703,7 +700,7 @@ def tag_inames(kernel, iname_to_tag, force=False, # convert dict to list of tuples if isinstance(iname_to_tag, dict): - iname_to_tag = list(six.iteritems(iname_to_tag)) + iname_to_tag = list(iname_to_tag.items()) # flatten iterables of tags for each iname @@ -752,7 +749,7 @@ def tag_inames(kernel, iname_to_tag, force=False, from loopy.match import re_from_glob new_iname_to_tag = {} for iname, new_tag in iname_to_tag: - if '*' in iname or '?' in iname: + if "*" in iname or "?" 
in iname: match_re = re_from_glob(iname) for sub_iname in all_inames: if match_re.match(sub_iname): @@ -773,7 +770,7 @@ def tag_inames(kernel, iname_to_tag, force=False, # }}} knl_iname_to_tags = kernel.iname_to_tags.copy() - for iname, new_tag in six.iteritems(iname_to_tag): + for iname, new_tag in iname_to_tag.items(): if not new_tag: continue @@ -806,10 +803,10 @@ def tag_inames(kernel, iname_to_tag, force=False, class _InameDuplicator(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, old_to_new, within): - super(_InameDuplicator, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.old_to_new = old_to_new - self.old_inames_set = set(six.iterkeys(old_to_new)) + self.old_inames_set = set(old_to_new.keys()) self.within = within def map_reduction(self, expr, expn_state): @@ -829,7 +826,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): self.rec(expr.expr, expn_state), expr.allow_simultaneous) else: - return super(_InameDuplicator, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def map_variable(self, expr, expn_state): new_name = self.old_to_new.get(expr.name) @@ -840,7 +837,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): expn_state.kernel, expn_state.instruction, expn_state.stack)): - return super(_InameDuplicator, self).map_variable(expr, expn_state) + return super().map_variable(expr, expn_state) else: from pymbolic import var return var(new_name) @@ -856,8 +853,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): @iterate_over_kernels_if_given_program -def duplicate_inames(knl, inames, within, new_inames=None, - suffix=None, +def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None, tags={}): """ :arg within: a stack match as understood by @@ -881,7 +877,7 @@ def duplicate_inames(knl, inames, within, new_inames=None, if len(new_inames) != len(inames): raise ValueError("new_inames must have the same number of entries as inames") - name_gen = knl.get_var_name_generator() + name_gen = kernel.get_var_name_generator() for i, iname in enumerate(inames): new_iname = new_inames[i] @@ -909,10 +905,10 @@ def duplicate_inames(knl, inames, within, new_inames=None, for old_iname, new_iname in zip(inames, new_inames): from loopy.kernel.tools import DomainChanger - domch = DomainChanger(knl, frozenset([old_iname])) + domch = DomainChanger(kernel, frozenset([old_iname])) from loopy.isl_helpers import duplicate_axes - knl = knl.copy( + kernel = kernel.copy( domains=domch.get_domains_with( duplicate_axes(domch.domain, [old_iname], [new_iname]))) @@ -921,13 +917,13 @@ def duplicate_inames(knl, inames, within, new_inames=None, # {{{ change the inames in the code rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, name_gen) + kernel.substitutions, name_gen) indup = _InameDuplicator(rule_mapping_context, old_to_new=dict(list(zip(inames, new_inames))), within=within) - knl = rule_mapping_context.finish_kernel( - indup.map_kernel(knl)) + kernel = rule_mapping_context.finish_kernel( + indup.map_kernel(kernel)) # }}} @@ -936,11 +932,11 @@ def duplicate_inames(knl, inames, within, new_inames=None, for old_iname, new_iname in zip(inames, new_inames): new_tag = tags.get(old_iname) if new_tag is not None: - knl = tag_inames(knl, {new_iname: new_tag}) + kernel = tag_inames(kernel, {new_iname: new_tag}) # }}} - return knl + return kernel # }}} @@ -963,8 +959,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( common = common.union(old_common_inames) 
# Go into recursion - for option in _get_iname_duplication_options(insn_iname_sets, common): - yield option + yield from _get_iname_duplication_options(insn_iname_sets, common) # Do not yield anything beyond here! return @@ -991,9 +986,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( if len(partitioning) > 1: for part in partitioning: working_set = frozenset(s for s in insn_iname_sets if s <= part) - for option in _get_iname_duplication_options(working_set, - old_common_inames): - yield option + yield from _get_iname_duplication_options(working_set, + old_common_inames) # If exactly one set was found, an iname duplication is necessary elif len(partitioning) == 1: inames, = partitioning @@ -1010,8 +1004,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # is inspected. For each element of the power set without the # empty and the full set, one duplication option is generated. for insns_to_dup in it.chain.from_iterable( - it.combinations(iname_insns, l) - for l in range(1, len(iname_insns))): + it.combinations(iname_insns, i) + for i in range(1, len(iname_insns))): yield ( iname, tuple(insn | old_common_inames for insn in insns_to_dup)) @@ -1019,7 +1013,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): +def get_iname_duplication_options_for_single_kernel(kernel, + use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1049,66 +1044,45 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ + if use_boostable_into: + raise LoopyError("'use_boostable_into=True' is no longer supported.") + + if use_boostable_into is False: + from warnings import warn + warn("passing 'use_boostable_into=False' to 'get_iname_duplication_options'" + " is deprecated. 
The argument will go away in 2021.", + DeprecationWarning, stacklevel=2) + from loopy.kernel.data import ConcurrentTag - concurrent_inames = set( + concurrent_inames = { iname - for iname in knl.all_inames() - if knl.iname_tags_of_type(iname, ConcurrentTag)) + for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} # First we extract the minimal necessary information from the kernel - if use_boostable_into: - insn_iname_sets = ( - frozenset( - (insn.within_inames - | insn.boostable_into if insn.boostable_into is not None - else frozenset([])) - - concurrent_inames - for insn in knl.instructions) - - - frozenset([frozenset([])])) - else: - insn_iname_sets = ( - frozenset( - insn.within_inames - concurrent_inames - for insn in knl.instructions) - - - frozenset([frozenset([])])) + insn_iname_sets = ( + frozenset( + insn.within_inames - concurrent_inames + for insn in kernel.instructions) + - + frozenset([frozenset([])])) # Get the duplication options as a tuple of iname and a set for iname, insns in _get_iname_duplication_options(insn_iname_sets): # Check whether this iname has a parallel tag and discard it if so - if (iname in knl.iname_to_tags - and knl.iname_tags_of_type(iname, ConcurrentTag)): + if (iname in kernel.iname_to_tags + and kernel.iname_tags_of_type(iname, ConcurrentTag)): continue - # If we find a duplication option and to not use boostable_into - # information, we restart this generator with use_boostable_into=True - if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options_for_single_kernel(knl, True): - yield option - - # Emit a warning that we needed boostable_into - from warnings import warn - from loopy.diagnostic import LoopyWarning - warn("Kernel '%s' required the deprecated 'boostable_into' " - "instruction attribute in order to be schedulable!" % knl.name, - LoopyWarning) - - # Return to avoid yielding the duplication - # options without boostable_into - return - # Reconstruct an object that may be passed to the within parameter of # loopy.duplicate_inames from loopy.match import Id, Or within = Or(tuple( - Id(insn.id) for insn in knl.instructions + Id(insn.id) for insn in kernel.instructions if insn.within_inames in insns)) - # Only yield the result if an instruction matched. With - # use_boostable_into=True this is not always true. - + # Only yield the result if an instruction matched. if within.children: yield iname, within @@ -1116,9 +1090,8 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals def get_iname_duplication_options(program, use_boostable_into=False): for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): - for option in get_iname_duplication_options_for_single_kernel( - in_knl_callable.subkernel, use_boostable_into): - yield option + yield from get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into) elif isinstance(in_knl_callable, ScalarCallable): pass else: @@ -1128,12 +1101,12 @@ def get_iname_duplication_options(program, use_boostable_into=False): return -def has_schedulable_iname_nesting_for_single_kernel(knl): +def has_schedulable_iname_nesting_for_single_kernel(kernel): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. 
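
    A typical driver loop combining this check with the duplication
    machinery above, sketched under the assumption that a duplication
    option is always found::

        import loopy as lp

        while not lp.has_schedulable_iname_nesting(prog):
            iname, within = next(lp.get_iname_duplication_options(prog))
            prog = lp.duplicate_inames(prog, iname, within)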
""" - return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + return not bool(next(get_iname_duplication_options_for_single_kernel(kernel), False)) @@ -1149,19 +1122,19 @@ def has_schedulable_iname_nesting(program): # {{{ rename_inames @iterate_over_kernels_if_given_program -def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): +def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. :arg existing_ok: execute even if *new_iname* already exists """ - var_name_gen = knl.get_var_name_generator() + var_name_gen = kernel.get_var_name_generator() # FIXME: Distinguish existing iname vs. existing other variable does_exist = var_name_gen.is_name_conflicting(new_iname) - if old_iname not in knl.all_inames(): + if old_iname not in kernel.all_inames(): raise LoopyError("old iname '%s' does not exist" % old_iname) if does_exist and not existing_ok: @@ -1171,7 +1144,7 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): if does_exist: # {{{ check that the domains match up - dom = knl.get_inames_domain(frozenset((old_iname, new_iname))) + dom = kernel.get_inames_domain(frozenset((old_iname, new_iname))) var_dict = dom.get_var_dict() _, old_idx = var_dict[old_iname] @@ -1208,17 +1181,17 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): from pymbolic.mapper.substitutor import make_subst_func rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, var_name_gen) + kernel.substitutions, var_name_gen) smap = RuleAwareSubstitutionMapper(rule_mapping_context, make_subst_func(subst_dict), within) - knl = rule_mapping_context.finish_kernel( - smap.map_kernel(knl)) + kernel = rule_mapping_context.finish_kernel( + smap.map_kernel(kernel)) new_instructions = [] - for insn in knl.instructions: + for insn in kernel.instructions: if (old_iname in insn.within_inames - and within(knl, insn, ())): + and within(kernel, insn, ())): insn = insn.copy( within_inames=( (insn.within_inames - frozenset([old_iname])) @@ -1226,22 +1199,35 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): new_instructions.append(insn) - knl = knl.copy(instructions=new_instructions) + kernel = kernel.copy(instructions=new_instructions) else: - knl = duplicate_inames( - knl, [old_iname], within=within, new_inames=[new_iname]) + kernel = duplicate_inames( + kernel, [old_iname], within=within, new_inames=[new_iname]) - knl = remove_unused_inames(knl, [old_iname]) + kernel = remove_unused_inames(kernel, [old_iname]) - return knl + return kernel # }}} # {{{ remove unused inames -def remove_unused_inames(knl, inames=None): +def get_used_inames(kernel): + import loopy as lp + exp_kernel = lp.expand_subst(kernel) + + used_inames = set() + for insn in exp_kernel.instructions: + used_inames.update( + exp_kernel.insn_inames(insn.id) + | insn.reduction_inames()) + + return used_inames + + +def remove_unused_inames(kernel, inames=None): """Delete those among *inames* that are unused, i.e. project them out of the domain. 
If these inames pose implicit restrictions on other inames, these restrictions will persist as existentially @@ -1253,7 +1239,7 @@ def remove_unused_inames(knl, inames=None): # {{{ normalize arguments if inames is None: - inames = knl.all_inames() + inames = kernel.all_inames() elif isinstance(inames, str): inames = inames.split(",") @@ -1261,17 +1247,7 @@ def remove_unused_inames(knl, inames=None): # {{{ check which inames are unused - import loopy as lp - exp_knl = lp.expand_subst(knl) - - inames = set(inames) - used_inames = set() - for insn in exp_knl.instructions: - used_inames.update( - exp_knl.insn_inames(insn.id) - | insn.reduction_inames()) - - unused_inames = inames - used_inames + unused_inames = set(inames) - get_used_inames(kernel) # }}} @@ -1280,17 +1256,44 @@ def remove_unused_inames(knl, inames=None): from loopy.kernel.tools import DomainChanger for iname in unused_inames: - domch = DomainChanger(knl, (iname,)) + domch = DomainChanger(kernel, (iname,)) dom = domch.domain dt, idx = dom.get_var_dict()[iname] dom = dom.project_out(dt, idx, 1) - knl = knl.copy(domains=domch.get_domains_with(dom)) + kernel = kernel.copy(domains=domch.get_domains_with(dom)) # }}} - return knl + return kernel + + +def remove_any_newly_unused_inames(transformation_func): + from functools import wraps + + @wraps(transformation_func) + def wrapper(kernel, *args, **kwargs): + + # check for remove_unused_inames argument, default: True + remove_newly_unused_inames = kwargs.pop("remove_newly_unused_inames", True) + + if remove_newly_unused_inames: + # determine which inames were already unused + inames_already_unused = kernel.all_inames() - get_used_inames(kernel) + + # call transform + transformed_kernel = transformation_func(kernel, *args, **kwargs) + + # Remove inames that are unused due to transform + return remove_unused_inames( + transformed_kernel, + transformed_kernel.all_inames()-inames_already_unused) + else: + # call transform + return transformation_func(kernel, *args, **kwargs) + + return wrapper # }}} @@ -1299,7 +1302,7 @@ def remove_unused_inames(knl, inames=None): class _ReductionSplitter(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, within, inames, direction): - super(_ReductionSplitter, self).__init__( + super().__init__( rule_mapping_context) self.within = within @@ -1334,7 +1337,7 @@ class _ReductionSplitter(RuleAwareIdentityMapper): else: assert False else: - return super(_ReductionSplitter, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def _split_reduction(kernel, inames, direction, within=None): @@ -1475,9 +1478,9 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): from pymbolic.algorithm import solve_affine_equations_for old_inames_to_expr = solve_affine_equations_for(old_inames, equations) - subst_dict = dict( - (v.name, expr) - for v, expr in old_inames_to_expr.items()) + subst_dict = { + v.name: expr + for v, expr in old_inames_to_expr.items()} var_name_gen = kernel.get_var_name_generator() @@ -1533,9 +1536,9 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): if dom_old_inames: dom_equations.append((lhs, rhs)) - this_eqn_old_iname_dim_types = set( + this_eqn_old_iname_dim_types = { dom_var_dict[old_iname][0] - for old_iname in eqn_deps & old_inames_set) + for old_iname in eqn_deps & old_inames_set} if this_eqn_old_iname_dim_types: if len(this_eqn_old_iname_dim_types) > 1: @@ -1621,9 +1624,9 @@ def find_unused_axis_tag(kernel, kind, insn_match=None): :func:`loopy.match.parse_match`. 
:arg kind: may be "l" or "g", or the corresponding tag class name - :returns: an :class:`GroupIndexTag` or :class:`LocalIndexTag` - that is not being used within the instructions matched by - *insn_match*. + :returns: an :class:`loopy.kernel.data.GroupIndexTag` or + :class:`loopy.kernel.data.LocalIndexTag` that is not being used within + the instructions matched by *insn_match*. """ used_axes = set() @@ -1679,7 +1682,7 @@ def separate_loop_head_tail_slab(kernel, iname, head_it_count, tail_it_count): class _ReductionInameUniquifier(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, inames, within): - super(_ReductionInameUniquifier, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.inames = inames self.old_to_new = [] @@ -1731,7 +1734,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expn_state), expr.allow_simultaneous) else: - return super(_ReductionInameUniquifier, self).map_reduction( + return super().map_reduction( expr, expn_state) @@ -1783,7 +1786,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn @iterate_over_kernels_if_given_program -def add_inames_to_insn(knl, inames, insn_match): +def add_inames_to_insn(kernel, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the instructions matched by *insn_match*, or a comma-separated @@ -1791,9 +1794,9 @@ def add_inames_to_insn(knl, inames, insn_match): :arg insn_match: An instruction match as understood by :func:`loopy.match.parse_match`. - :returns: an :class:`GroupIndexTag` or :class:`LocalIndexTag` - that is not being used within the instructions matched by - *insn_match*. + :returns: an :class:`loopy.kernel.data.GroupIndexTag` or + :class:`loopy.kernel.data.LocalIndexTag` that is not being used within + the instructions matched by *insn_match*. .. versionadded:: 2016.3 """ @@ -1809,16 +1812,125 @@ def add_inames_to_insn(knl, inames, insn_match): new_instructions = [] - for insn in knl.instructions: - if match(knl, insn): + for insn in kernel.instructions: + if match(kernel, insn): new_instructions.append( insn.copy(within_inames=insn.within_inames | inames)) else: new_instructions.append(insn) - return knl.copy(instructions=new_instructions) + return kernel.copy(instructions=new_instructions) # }}} +def add_inames_for_unused_hw_axes(kernel, within=None): + """ + Returns a kernel with inames added to each instruction + corresponding to any hardware-parallel iname tags + (:class:`loopy.kernel.data.GroupIndexTag`, + :class:`loopy.kernel.data.LocalIndexTag`) unused + in the instruction but used elsewhere in the kernel. + + Current limitations: + + * Only one iname in the kernel may be tagged with each of the unused hw axes. + * Occurence of an ``l.auto`` tag when an instruction is missing one of the + local hw axes. + + :arg within: An instruction match as understood by + :func:`loopy.match.parse_match`. 
+ """ + from loopy.kernel.data import (LocalIndexTag, GroupIndexTag, + AutoFitLocalIndexTag) + + n_local_axes = max([tag.axis + for tags in kernel.iname_to_tags.values() + for tag in tags + if isinstance(tag, LocalIndexTag)], + default=-1) + 1 + + n_group_axes = max([tag.axis + for tags in kernel.iname_to_tags.values() + for tag in tags + if isinstance(tag, GroupIndexTag)], + default=-1) + 1 + + contains_auto_local_tag = any([isinstance(tag, AutoFitLocalIndexTag) + for tags in kernel.iname_to_tags + for tag in tags]) + + if contains_auto_local_tag: + raise LoopyError("Kernels containing l.auto tags are invalid" + " arguments.") + + # {{{ fill axes_to_inames + + # local_axes_to_inames: ith entry contains the iname tagged with l.i or None + # if multiple inames are tagged with l.i + local_axes_to_inames = [] + # group_axes_to_inames: ith entry contains the iname tagged with g.i or None + # if multiple inames are tagged with g.i + group_axes_to_inames = [] + + for i in range(n_local_axes): + ith_local_axes_tag = LocalIndexTag(i) + inames = [iname + for iname, tags in kernel.iname_to_tags.items() + if ith_local_axes_tag in tags] + if not inames: + raise LoopyError(f"Unused local hw axes {i}.") + + local_axes_to_inames.append(inames[0] if len(inames) == 1 else None) + + for i in range(n_group_axes): + ith_group_axes_tag = GroupIndexTag(i) + inames = [iname + for iname, tags in kernel.iname_to_tags.items() + if ith_group_axes_tag in tags] + if not inames: + raise LoopyError(f"Unused group hw axes {i}.") + + group_axes_to_inames.append(inames[0] if len(inames) == 1 else None) + + # }}} + + from loopy.match import parse_match + within = parse_match(within) + + new_insns = [] + + for insn in kernel.instructions: + if within(kernel, insn): + within_tags = frozenset().union(*(kernel.iname_to_tags.get(iname, + frozenset()) for iname in insn.within_inames)) + missing_local_axes = [i for i in range(n_local_axes) + if LocalIndexTag(i) not in within_tags] + missing_group_axes = [i for i in range(n_group_axes) + if GroupIndexTag(i) not in within_tags] + + for axis in missing_local_axes: + iname = local_axes_to_inames[axis] + if iname: + insn = insn.copy(within_inames=insn.within_inames | + frozenset([iname])) + else: + raise LoopyError("Multiple inames tagged with l.%d while" + " adding unused local hw axes to instruction '%s'." + % (axis, insn.id)) + + for axis in missing_group_axes: + iname = group_axes_to_inames[axis] + if iname is not None: + insn = insn.copy(within_inames=insn.within_inames | + frozenset([iname])) + else: + raise LoopyError("Multiple inames tagged with g.%d while" + " adding unused group hw axes to instruction '%s'." + % (axis, insn.id)) + + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + # vim: foldmethod=marker diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index f73110ecdff79d7c029c0dd0d895ef71ea68326b..c84c1b9c69fc833877d42daf4c83b7dce5af3d4e 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six # noqa - from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) @@ -362,9 +358,9 @@ def uniquify_instruction_ids(kernel): from loopy.kernel.creation import UniqueName - insn_ids = set( + insn_ids = { insn.id for insn in kernel.instructions - if insn.id is not None and not isinstance(insn.id, UniqueName)) + if insn.id is not None and not isinstance(insn.id, UniqueName)} from pytools import UniqueNameGenerator insn_id_gen = UniqueNameGenerator(insn_ids) diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py index d0e7d1bc2ec5d1b5815ec8c8c30fecc198014c86..b8db7f43f90a5a1203dea470c9a0ba6f8fa21cae 100644 --- a/loopy/transform/make_scalar.py +++ b/loopy/transform/make_scalar.py @@ -7,13 +7,13 @@ from loopy.transform.iname import remove_unused_inames class ScalarChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, var_name): self.var_name = var_name - super(ScalarChanger, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) def map_subscript(self, expr, expn_state): if expr.aggregate.name == self.var_name: return Variable(self.var_name) - return super(ScalarChanger, self).map_subscript(expr, expn_state) + return super().map_subscript(expr, expn_state) def make_scalar(kernel, var_name): diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index a18326187379cac0b4be46bbfe244bcc2d9e7684..6fb4988f0f8d82b5a1169f92a52c7eb649a861e1 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni" __license__ = """ @@ -121,9 +119,9 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, from pymbolic import var dim_type = isl.dim_type.set - ilp_inames = set(iname for iname in insn.within_inames + ilp_inames = {iname for iname in insn.within_inames if all(isinstance(tag, (IlpBaseTag, VectorizeTag)) - for tag in kernel.iname_to_tags.get(iname, []))) + for tag in kernel.iname_to_tags.get(iname, []))} new_ilp_inames = set() ilp_inames_map = {} for iname in ilp_inames: @@ -156,10 +154,10 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, new_pack_inames = ilp_inames_map.copy() # packing-specific inames new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname - new_pack_inames = dict((iname, var(vng(iname.name + - "_pack"))) for iname in p.swept_inames) - new_unpack_inames = dict((iname, var(vng(iname.name + - "_unpack"))) for iname in p.swept_inames) + new_pack_inames = {iname: var(vng(iname.name + + "_pack")) for iname in p.swept_inames} + new_unpack_inames = {iname: var(vng(iname.name + + "_unpack")) for iname in p.swept_inames} # Updating the domains corresponding to the new inames. 
for iname in p.swept_inames: @@ -228,8 +226,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, packing_insns.append(Assignment( assignee=pack_lhs_assignee, expression=pack_subst_mapper.map_subscript(p.subscript), - within_inames=insn.within_inames - ilp_inames | set( - new_pack_inames[i].name for i in p.swept_inames) | ( + within_inames=insn.within_inames - ilp_inames | { + new_pack_inames[i].name for i in p.swept_inames} | ( new_ilp_inames), depends_on=insn.depends_on, id=ing(insn.id+"_pack"), @@ -240,8 +238,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, unpacking_insns.append(Assignment( expression=unpack_rhs, assignee=unpack_subst_mapper.map_subscript(p.subscript), - within_inames=insn.within_inames - ilp_inames | set( - new_unpack_inames[i].name for i in p.swept_inames) | ( + within_inames=insn.within_inames - ilp_inames | { + new_unpack_inames[i].name for i in p.swept_inames} | ( new_ilp_inames), id=ing(insn.id+"_unpack"), depends_on=frozenset([insn.id]), @@ -282,8 +280,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) for i, _ in enumerate(insn.assignees)) new_call_insn = new_call_insn.copy( - depends_on=new_call_insn.depends_on | set( - pack.id for pack in packing_insns), + depends_on=new_call_insn.depends_on | { + pack.id for pack in packing_insns}, within_inames=new_call_insn.within_inames - ilp_inames | ( new_ilp_inames), expression=new_call_insn.expression.function(*new_params), diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 2ee3bd9b153907b564f4ca25c4c3720a6910d509..1e267321596d7e551645200e117055378c7c5c1e 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -1,7 +1,3 @@ -from __future__ import division -from __future__ import absolute_import -import six - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -34,7 +30,7 @@ from loopy.kernel import LoopKernel class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): - super(ArrayAxisSplitHelper, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.arg_names = arg_names self.handler = handler @@ -42,7 +38,7 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): if expr.aggregate.name in self.arg_names: return self.handler(expr) else: - return super(ArrayAxisSplitHelper, self).map_subscript(expr, expn_state) + return super().map_subscript(expr, expn_state) # {{{ split_array_dim (deprecated since June 2016) @@ -93,8 +89,8 @@ def split_array_dim(kernel, arrays_and_axes, count, if isinstance(arrays_and_axes, tuple): arrays_and_axes = [arrays_and_axes] - array_to_rest = dict( - (tup[0], normalize_rest(tup[1:])) for tup in arrays_and_axes) + array_to_rest = { + tup[0]: normalize_rest(tup[1:]) for tup in arrays_and_axes} if len(arrays_and_axes) != len(array_to_rest): raise RuntimeError("cannot split multiple axes of the same variable") @@ -107,7 +103,7 @@ def split_array_dim(kernel, arrays_and_axes, count, from loopy.kernel.tools import ArrayChanger - for array_name, (axis, order) in six.iteritems(array_to_rest): + for array_name, (axis, order) in array_to_rest.items(): achng = ArrayChanger(kernel, array_name) ary = achng.get() @@ -238,12 +234,12 @@ def split_array_dim(kernel, arrays_and_axes, count, rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, var_name_gen) aash = ArrayAxisSplitHelper(rule_mapping_context, - 
set(six.iterkeys(array_to_rest)), split_access_axis) + set(array_to_rest.keys()), split_access_axis) kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) if auto_split_inames: - from loopy.transform.iname import split_iname - for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): + from loopy import split_iname + for iname, (outer_iname, inner_iname) in split_vars.items(): kernel = split_iname(kernel, iname, count, outer_iname=outer_iname, inner_iname=inner_iname, **split_kwargs) @@ -369,7 +365,7 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, var_name_gen) aash = ArrayAxisSplitHelper(rule_mapping_context, - set([array_name]), split_access_axis) + {array_name}, split_access_axis) kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) return kernel @@ -391,8 +387,9 @@ def split_array_axis(kernel, array_names, axis_nr, count, .. versionchanged:: 2016.2 - There was a more complicated, dumber function called :func:`split_array_dim` - that had the role of this function in versions prior to 2016.2. + There was a more complicated, dumber function called + ``loopy.split_array_dim`` that had the role of this function in + versions prior to 2016.2. """ assert isinstance(kernel, LoopKernel) @@ -449,7 +446,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 @iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): - arg_to_idx = dict((arg.name, i) for i, arg in enumerate(kernel.args)) + arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)} arg_idx = arg_to_idx[variable] new_args = kernel.args[:] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 5c5e94028e5dfaa87f802f88bae715cfe733d6af..d93513f9833861fa3511280f36d3473f7d00cd3a 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl @@ -117,7 +114,7 @@ def _fix_parameter(kernel, name, value, remove_argument): new_args.append(arg.map_exprs(map_expr)) new_temp_vars = {} - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): new_temp_vars[tv.name] = tv.map_exprs(map_expr) from loopy.match import parse_stack_match @@ -155,7 +152,7 @@ def fix_parameters(kernel, **value_dict): remove_arg = value_dict.pop("_remove", True) - for name, value in six.iteritems(value_dict): + for name, value in value_dict.items(): kernel = _fix_parameter(kernel, name, value, remove_arg) return kernel diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 8837cc3d574d751ac9bf5af4dc04250e6ef87d33..7d052730fc6ca808080f4f5a343d41266ae02dba 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -23,8 +21,6 @@ THE SOFTWARE. 
""" -import six -from six.moves import range, zip import islpy as isl from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, @@ -66,7 +62,7 @@ def storage_axis_exprs(storage_axis_sources, args): class RuleInvocationGatherer(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, within): - super(RuleInvocationGatherer, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) from loopy.symbolic import SubstitutionRuleExpander self.subst_expander = SubstitutionRuleExpander( @@ -91,7 +87,7 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper): expn_state.stack) if not process_me: - return super(RuleInvocationGatherer, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) rule = self.rule_mapping_context.old_subst_rules[name] @@ -99,7 +95,7 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper): name, rule.arguments, arguments, expn_state.arg_context) arg_deps = set() - for arg_val in six.itervalues(arg_context): + for arg_val in arg_context.values(): arg_deps = (arg_deps | get_dependencies(self.subst_expander(arg_val))) @@ -116,7 +112,7 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper): ", ".join(arg_deps - self.kernel.all_inames()), )) - return super(RuleInvocationGatherer, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) args = [arg_context[arg_name] for arg_name in rule.arguments] @@ -141,7 +137,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): non1_storage_axis_names, temporary_name, compute_insn_id, compute_dep_id, compute_read_variables): - super(RuleInvocationReplacer, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.subst_name = subst_name self.subst_tag = subst_tag @@ -169,7 +165,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): expn_state.instruction, expn_state.stack) and (self.subst_tag is None or self.subst_tag == tag)): - return super(RuleInvocationReplacer, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) # {{{ check if in footprint @@ -184,7 +180,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): self.storage_axis_sources, args)) if not self.array_base_map.is_access_descriptor_in_footprint(accdesc): - return super(RuleInvocationReplacer, self).map_substitution( + return super().map_substitution( name, tag, arguments, expn_state) # }}} @@ -227,12 +223,13 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): def map_kernel(self, kernel): new_insns = [] - excluded_insn_ids = set([self.compute_insn_id, self.compute_dep_id]) + excluded_insn_ids = {self.compute_insn_id, self.compute_dep_id} for insn in kernel.instructions: self.replaced_something = False - insn = insn.with_transformed_expressions(self, kernel, insn) + insn = insn.with_transformed_expressions( + lambda expr: self(expr, kernel, insn)) if self.replaced_something: insn = insn.copy( @@ -257,7 +254,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): # }}} -class _not_provided(object): # noqa: N801 +class _not_provided: # noqa: N801 pass @@ -618,7 +615,7 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, name = old_name = subst.arguments[saxis] else: old_name = saxis - name = "%s_%s" % (c_subst_name, old_name) + name = f"{c_subst_name}_{old_name}" if (precompute_inames is not None and i < len(precompute_inames) diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py 
index d4128bd115666cf66c6f06a40823ed9d5929faab..8527023bc789c9b3c9e18fe7ad6827c82a6e7a55 100644 --- a/loopy/transform/privatize.py +++ b/loopy/transform/privatize.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -23,7 +21,6 @@ THE SOFTWARE. """ -import six from loopy.diagnostic import LoopyError import logging @@ -120,7 +117,7 @@ def privatize_temporaries_with_inames( # {{{ find variables that need extra indices - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): if only_var_names is not None and tv.name not in only_var_names: continue @@ -158,7 +155,7 @@ def privatize_temporaries_with_inames( from loopy.symbolic import pw_aff_to_expr priv_axis_iname_to_length = {} - for priv_axis_inames in six.itervalues(var_to_new_priv_axis_iname): + for priv_axis_inames in var_to_new_priv_axis_iname.values(): for iname in priv_axis_inames: if iname in priv_axis_iname_to_length: continue @@ -177,7 +174,7 @@ def privatize_temporaries_with_inames( from loopy.kernel.data import VectorizeTag new_temp_vars = kernel.temporary_variables.copy() - for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname): + for tv_name, inames in var_to_new_priv_axis_iname.items(): tv = new_temp_vars[tv_name] extra_shape = tuple(priv_axis_iname_to_length[iname] for iname in inames) @@ -199,9 +196,9 @@ def privatize_temporaries_with_inames( # }}} from pymbolic import var - var_to_extra_iname = dict( - (var_name, tuple(var(iname) for iname in inames)) - for var_name, inames in six.iteritems(var_to_new_priv_axis_iname)) + var_to_extra_iname = { + var_name: tuple(var(iname) for iname in inames) + for var_name, inames in var_to_new_priv_axis_iname.items()} new_insns = [] diff --git a/loopy/transform/save.py b/loopy/transform/save.py index c8e9a11a052817456b77af8f0722e802b2d180fd..35a175b68fa4f81f8d14cc688856265738147716 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2016 Matt Wala" __license__ = """ @@ -25,7 +23,6 @@ THE SOFTWARE. 
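# Sketch of privatize_temporaries_with_inames (modernized above): give a
# temporary one copy per value of an ILP iname so the unrolled instances do
# not clobber each other (hypothetical kernel, for illustration only):
import numpy as np
import loopy as lp

knl = lp.make_kernel(
        "{[i,j]: 0<=i<16 and 0<=j<4}",
        """
        <> acc = 0 {id=init}
        acc = acc + a[i,j] {id=update, dep=init}
        out[i,j] = acc {dep=update}
        """,
        [lp.GlobalArg("a", np.float64, shape=(16, 4)),
         lp.GlobalArg("out", np.float64, shape=(16, 4))],
        lang_version=(2018, 2))
knl = lp.tag_inames(knl, {"j": "ilp"})
knl = lp.privatize_temporaries_with_inames(
        knl, frozenset(["j"]), only_var_names=frozenset(["acc"]))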
from loopy.diagnostic import LoopyError import loopy as lp -import six from loopy.kernel.data import auto, AddressSpace from pytools import memoize_method, Record @@ -60,7 +57,7 @@ class LivenessResult(dict): for idx in range(nscheditems)) -class LivenessAnalysis(object): +class LivenessAnalysis: def __init__(self, kernel): self.kernel = kernel @@ -82,10 +79,10 @@ class LivenessAnalysis(object): elif isinstance(next_item, EnterLoop): # Account for empty loop loop_end = block_bounds[sched_idx + 1] - after = successors[loop_end] | set([sched_idx + 1]) + after = successors[loop_end] | {sched_idx + 1} elif isinstance(next_item, (LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier)): - after = set([sched_idx + 1]) + after = {sched_idx + 1} else: raise LoopyError("unexpected type of schedule item: {ty}" .format(ty=type(next_item).__name__)) @@ -94,7 +91,7 @@ class LivenessAnalysis(object): if isinstance(item, LeaveLoop): # Account for loop loop_begin = block_bounds[sched_idx] - after |= set([loop_begin]) + after |= {loop_begin} elif not isinstance(item, (EnterLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier)): raise LoopyError("unexpected type of schedule item: {ty}" @@ -105,8 +102,8 @@ class LivenessAnalysis(object): return successors def get_gen_and_kill_sets(self): - gen = dict((idx, set()) for idx in range(len(self.schedule))) - kill = dict((idx, set()) for idx in range(len(self.schedule))) + gen = {idx: set() for idx in range(len(self.schedule))} + kill = {idx: set() for idx in range(len(self.schedule))} for sched_idx, sched_item in enumerate(self.schedule): if not isinstance(sched_item, RunInstruction): @@ -186,7 +183,7 @@ class LivenessAnalysis(object): # {{{ save and reload implementation -class TemporarySaver(object): +class TemporarySaver: class PromotedTemporary(Record): """ @@ -265,15 +262,15 @@ class TemporarySaver(object): isl.Space.create_from_names( isl.DEFAULT_CONTEXT, set=[], - params=set( + params={ arg.name for arg in kernel.args - if isinstance(arg, ValueArg))))) + if isinstance(arg, ValueArg)}))) def find_accessing_instructions_in_subkernel(self, temporary, subkernel): # Find all accessing instructions in the subkernel. If base_storage is # present, this includes instructions that access aliasing memory. - aliasing_names = set([temporary]) + aliasing_names = {temporary} base_storage = self.kernel.temporary_variables[temporary].base_storage if base_storage is not None: @@ -305,7 +302,7 @@ class TemporarySaver(object): result = defaultdict(set) - for temporary in six.itervalues(self.kernel.temporary_variables): + for temporary in self.kernel.temporary_variables.values(): if temporary.base_storage is None: continue result[temporary.base_storage].add(temporary.name) @@ -512,7 +509,7 @@ class TemporarySaver(object): self.new_subdomain = new_subdomain save_or_load_insn_id = self.insn_name_gen( - "{name}.{mode}".format(name=temporary, mode=mode)) + f"{temporary}.{mode}") def add_subscript_if_subscript_nonempty(agg, subscript=()): from pymbolic.primitives import Subscript, Variable @@ -550,10 +547,10 @@ class TemporarySaver(object): pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel) if pre_barrier is not None: - depends_on |= set([pre_barrier]) + depends_on |= {pre_barrier} if post_barrier is not None: - update_deps |= set([post_barrier]) + update_deps |= {post_barrier} # Create the load / store instruction. 
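# The successor/gen/kill machinery above feeds a standard backward data-flow
# fixed point. Schematic version (a generic sketch, not loopy's exact
# implementation; integer indices stand for schedule items):

def solve_liveness(successors, gen, kill, nitems):
    live_in = {i: set() for i in range(nitems)}
    live_out = {i: set() for i in range(nitems)}
    changed = True
    while changed:
        changed = False
        # Iterate in reverse schedule order: liveness flows backwards.
        for i in reversed(range(nitems)):
            new_out = set()
            for succ in successors.get(i, ()):
                new_out |= live_in[succ]
            new_in = gen[i] | (new_out - kill[i])
            if new_in != live_in[i] or new_out != live_out[i]:
                live_in[i], live_out[i] = new_in, new_out
                changed = True
    return live_in, live_out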
from loopy.kernel.data import Assignment @@ -564,9 +561,7 @@ class TemporarySaver(object): self.subkernel_to_surrounding_inames[subkernel] | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, - depends_on=depends_on, - boostable=False, - boostable_into=frozenset()) + depends_on=depends_on) if mode == "save": self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) @@ -591,7 +586,7 @@ class TemporarySaver(object): def finish(self): new_instructions = [] - insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) + insns_to_insert = {insn.id: insn for insn in self.insns_to_insert} for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: @@ -764,7 +759,7 @@ def save_and_reload_temporaries(program): from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - for sched_idx, sched_item in enumerate(knl.schedule): + for sched_idx, sched_item in enumerate(program.root_kernel.schedule): if isinstance(sched_item, CallKernel): # Any written temporary that is live-out needs to be read into @@ -775,25 +770,26 @@ def save_and_reload_temporaries(program): else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_read_in_subkernel(knl, subkernel) - | temporaries_written_in_subkernel(knl, subkernel)) + temporaries_read_in_subkernel(program.root_kernel, subkernel) + | temporaries_written_in_subkernel(program.root_kernel, + subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: - logger.info("reloading {0} at entry of {1}" + logger.info("reloading {} at entry of {}" .format(temporary, sched_item.kernel_name)) saver.reload(temporary, sched_item.kernel_name) elif isinstance(sched_item, ReturnFromKernel): - if sched_idx == len(knl.schedule) - 1: + if sched_idx == len(program.root_kernel.schedule) - 1: # Kernel exit: nothing live interesting_temporaries = set() else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_written_in_subkernel(knl, subkernel)) + temporaries_written_in_subkernel(program.root_kernel, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: - logger.info("saving {0} before return of {1}" + logger.info("saving {} before return of {}" .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 725e6792055b6ea7dc0b8663204a264540f79fd3..d7aaf6093fbcd8cd84667e55f44e13c129d3bef0 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,11 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six - from loopy.symbolic import ( RuleAwareIdentityMapper, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var @@ -103,8 +100,8 @@ def extract_subst(kernel, subst_name, template, parameters=()): ExprDescriptor( insn=insn, expr=expr, - unif_var_dict=dict((lhs.name, rhs) - for lhs, rhs in urec.equations))) + unif_var_dict={lhs.name: rhs + for lhs, rhs in urec.equations})) else: mapper.fallback_mapper(expr) # can't nest, don't recurse @@ -117,7 +114,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): dfmapper(insn.assignees) dfmapper(insn.expression) - for sr in six.itervalues(kernel.substitutions): + for sr in kernel.substitutions.values(): dfmapper(sr.expression) # }}} @@ -151,8 +148,30 @@ def extract_subst(kernel, subst_name, template, parameters=()): new_insns = [] + def transform_assignee(expr): + # Assignment LHS's cannot be subst rules. Treat them + # specially. + + import pymbolic.primitives as prim + if isinstance(expr, tuple): + return tuple( + transform_assignee(expr_i) + for expr_i in expr) + + elif isinstance(expr, prim.Subscript): + return type(expr)( + expr.aggregate, + cbmapper(expr.index)) + + elif isinstance(expr, prim.Variable): + return expr + else: + raise ValueError("assignment LHS not understood") + for insn in kernel.instructions: - new_insns.append(insn.with_transformed_expressions(cbmapper)) + new_insns.append( + insn.with_transformed_expressions( + cbmapper, assignee_f=transform_assignee)) from loopy.kernel.data import SubstitutionRule new_substs = { @@ -162,7 +181,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): expression=template, )} - for subst in six.itervalues(kernel.substitutions): + for subst in kernel.substitutions.values(): new_substs[subst.name] = subst.copy( expression=cbmapper(subst.expression)) @@ -183,7 +202,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): usage_to_definition, extra_arguments, within): self.var_name_gen = rule_mapping_context.make_unique_var_name - super(AssignmentToSubstChanger, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.lhs_name = lhs_name self.definition_insn_ids = definition_insn_ids @@ -215,7 +234,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): if result is not None: return result - return super(AssignmentToSubstChanger, self).map_variable( + return super().map_variable( expr, expn_state) def map_subscript(self, expr, expn_state): @@ -225,7 +244,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): if result is not None: return result - return super(AssignmentToSubstChanger, self).map_subscript( + return super().map_subscript( expr, expn_state) def transform_access(self, index, expn_state): @@ -261,6 +280,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): @iterate_over_kernels_if_given_program +@remove_any_newly_unused_inames def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) @@ -363,7 +383,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ create new substitution rules new_substs = kernel.substitutions.copy() - for def_id, subst_name in six.iteritems(tts.definition_insn_id_to_subst_name): + for def_id, subst_name in tts.definition_insn_id_to_subst_name.items(): def_insn = kernel.id_to_insn[def_id] 
from loopy.kernel.data import Assignment @@ -404,7 +424,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, new_args = kernel.args if lhs_name in kernel.temporary_variables: - if not any(six.itervalues(tts.saw_unmatched_usage_sites)): + if not any(tts.saw_unmatched_usage_sites.values()): # All usage sites matched--they're now substitution rules. # We can get rid of the variable. @@ -412,7 +432,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, del new_temp_vars[lhs_name] if lhs_name in kernel.arg_dict and not force_retain_argument: - if not any(six.itervalues(tts.saw_unmatched_usage_sites)): + if not any(tts.saw_unmatched_usage_sites.values()): # All usage sites matched--they're now substitution rules. # We can get rid of the argument @@ -427,11 +447,10 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, import loopy as lp kernel = lp.remove_instructions( kernel, - set( + { insn_id - for insn_id, still_used in six.iteritems( - tts.saw_unmatched_usage_sites) - if not still_used)) + for insn_id, still_used in tts.saw_unmatched_usage_sites.items() + if not still_used}) return kernel.copy( substitutions=new_substs, @@ -475,7 +494,7 @@ def expand_subst(kernel, within=None): # {{{ find substitution rules by glob patterns -def find_rules_matching(knl, pattern): +def find_rules_matching(kernel, pattern): """ :pattern: A shell-style glob pattern. """ @@ -483,7 +502,7 @@ def find_rules_matching(knl, pattern): from loopy.match import re_from_glob pattern = re_from_glob(pattern) - return [r for r in knl.substitutions if pattern.match(r)] + return [r for r in kernel.substitutions if pattern.match(r)] def find_one_rule_matching(program, pattern): diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 0d4430e0dd61f35d6c53d8d176449fbd67722cf9..7718988aab98fbf26c221110d4f02487cc675fa3 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six - from pymbolic.mapper import CombineMapper import numpy as np @@ -49,7 +45,7 @@ logger = logging.getLogger(__name__) def _debug(kernel, s, *args): if logger.isEnabledFor(logging.DEBUG): logstr = s % args - logger.debug("%s: %s" % (kernel.name, logstr)) + logger.debug(f"{kernel.name}: {logstr}") def get_return_types_as_tuple(arg_id_to_dtype): @@ -58,8 +54,8 @@ def get_return_types_as_tuple(arg_id_to_dtype): :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a mapping from the arguments to their inferred types. 
""" - return_arg_id_to_dtype = dict((id, dtype) for id, dtype in - arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_id_to_dtype = {id: dtype for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)} return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) @@ -75,7 +71,7 @@ class FunctionNameChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, calls_to_new_names, subst_expander): - super(FunctionNameChanger, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.calls_to_new_names = calls_to_new_names self.subst_expander = subst_expander @@ -98,7 +94,7 @@ class FunctionNameChanger(RuleAwareIdentityMapper): tuple(self.rec(child, expn_state) for child in expanded_expr.parameters)) else: - return super(FunctionNameChanger, self).map_call( + return super().map_call( expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) @@ -110,12 +106,12 @@ class FunctionNameChanger(RuleAwareIdentityMapper): ResolvedFunction(self.calls_to_new_names[expr]), tuple(self.rec(child, expn_state) for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} ) else: - return super(FunctionNameChanger, self).map_call_with_kwargs( + return super().map_call_with_kwargs( expr, expn_state) @@ -219,7 +215,7 @@ class TypeInferenceMapper(CombineMapper): if return_tuple: kwargs["return_tuple"] = True - result = super(TypeInferenceMapper, self).__call__( + result = super().__call__( expr, **kwargs) assert isinstance(result, list) @@ -396,7 +392,7 @@ class TypeInferenceMapper(CombineMapper): def map_type_cast(self, expr): subtype, = self.rec(expr.child) if not issubclass(subtype.dtype.type, np.number): - raise LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type)) + raise LoopyError(f"Can't cast a '{subtype}' to '{expr.type}'") return [expr.type] def map_subscript(self, expr): @@ -426,8 +422,8 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in - tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + arg_id_to_dtype = {i: none_if_empty(self.rec(par)) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())} # specializing the known function wrt type if isinstance(expr.function, ResolvedFunction): @@ -525,11 +521,11 @@ class TypeInferenceMapper(CombineMapper): ValueArgDescriptor) # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes - arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) - for i, dt in enumerate(mangle_result.arg_dtypes)) - arg_id_to_dtype.update(dict((-i-1, - dtype.with_target(self.kernel.target)) for i, dtype in enumerate( - mangle_result.result_dtypes))) + arg_id_to_dtype = {i: dt.with_target(self.kernel.target) + for i, dt in enumerate(mangle_result.arg_dtypes)} + arg_id_to_dtype.update({-i-1: + dtype.with_target(self.kernel.target) for i, dtype in enumerate( + mangle_result.result_dtypes)}) arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in enumerate(mangle_result.arg_dtypes)) res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in @@ -726,11 +722,11 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if isinstance(writer_insn, lp.Assignment): result = type_inf_mapper(expr, 
return_dtype_set=True) elif isinstance(writer_insn, lp.CallInstruction): - return_dtype_set = type_inf_mapper(expr, return_tuple=True, + return_dtype_sets = type_inf_mapper(expr, return_tuple=True, return_dtype_set=True) result = [] - for return_dtype_set in return_dtype_set: + for return_dtype_set in return_dtype_sets: result_i = None found = False for assignee, comp_dtype_set in zip( @@ -810,7 +806,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, names_for_type_inference = [] import loopy as lp - for tv in six.itervalues(kernel.temporary_variables): + for tv in kernel.temporary_variables.values(): assert tv.dtype is not lp.auto if tv.dtype is None: names_for_type_inference.append(tv.name) @@ -827,15 +823,15 @@ writer_map = kernel.writer_map() - dep_graph = dict( - (written_var, set( + dep_graph = { + written_var: { read_var for insn_id in writer_map.get(written_var, []) for read_var in kernel.id_to_insn[insn_id].read_dependency_names() - if read_var in names_for_type_inference)) - for written_var in names_for_type_inference) + if read_var in names_for_type_inference} + for written_var in names_for_type_inference} - from loopy.tools import compute_sccs + from pytools.graph import compute_sccs # To speed up processing, we sort the variables by computing the SCCs of the # type dependency graph. Each SCC represents a set of variables whose types diff --git a/loopy/types.py b/loopy/types.py index 4e77317c105a1f8b6acb61029ae6d81533d60372..2457049073eab8c73202e324514526097b56c4d1 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,13 +20,28 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six # noqa import numpy as np from loopy.diagnostic import LoopyError +__doc__ = """ +.. currentmodule:: loopy.types + +.. autoclass:: LoopyType + +.. autoclass:: NumpyType + +.. autoclass:: AtomicType + +.. autoclass:: AtomicNumpyType +""" -class LoopyType(object): + +class LoopyType: + """ + Abstract class for dtypes of variables encountered in a + :class:`loopy.LoopKernel`. + """ def with_target(self, target): return self @@ -55,7 +68,10 @@ class LoopyType(object): class AtomicType(LoopyType): - pass + """ + Abstract class for dtypes of variables encountered in a :class:`loopy.LoopKernel` + on which atomic operations are performed.
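# Usage sketch of the compute_sccs now imported from pytools.graph, on a toy
# dependency graph (mapping node -> set of successors):
from pytools.graph import compute_sccs

toy_dep_graph = {"a": {"b"}, "b": {"a"}, "c": {"a"}}
# Returns one list per strongly connected component, e.g.
# [["a", "b"], ["c"]] (component ordering may vary).
print(compute_sccs(toy_dep_graph))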
+ """ # {{{ numpy-based dtype @@ -137,7 +153,7 @@ class NumpyType(LoopyType): else: return any( dtype_involves_complex(f[0]) - for f in six.itervalues(dtype.fields)) + for f in dtype.fields.values()) return dtype_involves_complex(self.dtype) diff --git a/loopy/version.py b/loopy/version.py index 29abbc2de889b884de93e5fe39a1d996811c93c9..fddd44479adcae87ec96f470a690274b154fde54 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -42,7 +42,7 @@ else: # }}} -VERSION = (2019, 1) +VERSION = (2020, 2, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS @@ -60,21 +60,17 @@ except ImportError: else: _cgen_version = cgen.version.VERSION_TEXT -DATA_MODEL_VERSION = "%s-islpy%s-cgen%s-%s-v0" % ( +DATA_MODEL_VERSION = "{}-islpy{}-cgen{}-{}-v1".format( VERSION_TEXT, _islpy_version, _cgen_version, _git_rev) -FALLBACK_LANGUAGE_VERSION = (2017, 2, 1) +FALLBACK_LANGUAGE_VERSION = (2018, 2) MOST_RECENT_LANGUAGE_VERSION = (2018, 2) LOOPY_USE_LANGUAGE_VERSION_2018_2 = (2018, 2) -LOOPY_USE_LANGUAGE_VERSION_2018_1 = (2018, 1) -LOOPY_USE_LANGUAGE_VERSION_2017_2_1 = (2017, 2, 1) LANGUAGE_VERSION_SYMBOLS = [ "LOOPY_USE_LANGUAGE_VERSION_2018_2", - "LOOPY_USE_LANGUAGE_VERSION_2018_1", - "LOOPY_USE_LANGUAGE_VERSION_2017_2_1", ] __doc__ = """ @@ -102,7 +98,7 @@ language version to let them take advantage of this check. As a result, :mod:`loopy` will now issue a warning when a call to :func:`loopy.make_kernel` does not declare a language version. Such kernels -will (indefinitely) default to language version 2017.2.1. If passing a +will (indefinitely) default to language version 2018.2. If passing a language version to :func:`make_kernel` is impractical, you may also import one of the ``LOOPY_USE_LANGUAGE_VERSION_...`` symbols given below using:: @@ -129,14 +125,16 @@ History of Language Versions .. data:: LOOPY_USE_LANGUAGE_VERSION_2018_2 - :attr:`loopy.Options.ignore_boostable_into` is turned on by default. + ``loopy.Options.ignore_boostable_into`` is turned on by default. .. data:: LOOPY_USE_LANGUAGE_VERSION_2018_1 - :attr:`loopy.Options.enforce_variable_access_ordered` - is turned on by default. + :attr:`loopy.Options.enforce_variable_access_ordered` is turned on by + default. Unsupported from :mod:`loopy` version 2020.2 onwards. .. data:: LOOPY_USE_LANGUAGE_VERSION_2017_2_1 - Initial legacy language version. + Initial legacy language version. Unsupported from :mod:`loopy` version + 2020.2 onwards. 
+ """ diff --git a/proto-tests/test_fem_assembly.py b/proto-tests/test_fem_assembly.py index 18f2a5bfabdd52abad9d78aacf4f1d5be53b5ac1..dde093d53be125c2b1eaf13022d51b3300b61314 100644 --- a/proto-tests/test_fem_assembly.py +++ b/proto-tests/test_fem_assembly.py @@ -1,5 +1,3 @@ -from __future__ import division - import numpy as np import pyopencl as cl import loopy as lp diff --git a/proto-tests/test_sem.py b/proto-tests/test_sem.py index 4613b74ae787fe086ead935ddec61ff1a5438521..b84d072d0546270e6d21702f7b0f5b6354f7a238 100644 --- a/proto-tests/test_sem.py +++ b/proto-tests/test_sem.py @@ -1,5 +1,3 @@ -from __future__ import division - import numpy as np import pyopencl as cl import loopy as lp diff --git a/proto-tests/test_sem_tim.py b/proto-tests/test_sem_tim.py index 1bfb437fb6de1cb5511d108eb35a8ad32326122e..9d8dfcfa680fc484f20c9511b34210b15af8d635 100644 --- a/proto-tests/test_sem_tim.py +++ b/proto-tests/test_sem_tim.py @@ -1,5 +1,3 @@ -from __future__ import division - import numpy as np import pyopencl as cl import loopy as lp diff --git a/proto-tests/test_tim.py b/proto-tests/test_tim.py index d7061933e5667a623b4157ea6900a4b13c55e6c4..773821dce08adb758f49da6e8a6102011005beec 100644 --- a/proto-tests/test_tim.py +++ b/proto-tests/test_tim.py @@ -1,5 +1,3 @@ -from __future__ import division - import numpy as np import pyopencl as cl import loopy as lp diff --git a/requirements.txt b/requirements.txt index 97c2024764715d0a715520800e2e1dd467183479..2105aede063c65752ef4a9262eb960f749778a8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ -git+https://github.com/inducer/pytools.git -git+https://github.com/inducer/islpy.git -git+https://github.com/inducer/cgen.git -git+https://github.com/inducer/pyopencl.git -git+https://github.com/inducer/pymbolic.git -git+https://github.com/inducer/genpy.git -git+https://github.com/inducer/codepy.git +git+https://github.com/inducer/pytools.git#egg=pytools +git+https://github.com/inducer/islpy.git#egg=islpy +git+https://github.com/inducer/cgen.git#egg=cgen +git+https://github.com/inducer/pyopencl.git#egg=pyopencl +git+https://github.com/inducer/pymbolic.git#egg=pymbolic +git+https://github.com/inducer/genpy.git#egg=genpy +git+https://github.com/inducer/codepy.git#egg=codepy git+https://github.com/inducer/f2py # Optional, needed for using the C preprocessor on Fortran -ply>=3.6 \ No newline at end of file +ply>=3.6 diff --git a/setup.cfg b/setup.cfg index a0d95746e1a399d6a2d7c315bffc9b834d2f5487..9495d106cf389d485037db16a35a14b4aaf6c873 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,7 @@ max-line-length=85 exclude= loopy/target/c/compyte/ndarray, loopy/target/c/compyte/array.py + +inline-quotes = " +docstring-quotes = """ +multiline-quotes = """ diff --git a/setup.py b/setup.py index 92c16a0f5d03f84d87106b6ec9d25b95a00a5872..ddc47fefca853321d383bad4aeaa6f24f6d5c901 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import os from setuptools import setup, find_packages @@ -12,7 +11,7 @@ finally: version_file.close() os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1" -exec(compile(version_file_contents, "loopy/version.py", 'exec'), ver_dic) +exec(compile(version_file_contents, "loopy/version.py", "exec"), ver_dic) # {{{ capture git revision at install time @@ -34,9 +33,7 @@ def find_git_revision(tree_root): cwd=tree_root) (git_rev, _) = p.communicate() - import sys - if sys.version_info >= (3,): - git_rev = git_rev.decode() + git_rev = git_rev.decode() git_rev = git_rev.rstrip() @@ 
-56,7 +53,7 @@ def write_git_revision(package_name): git_rev = find_git_revision(dn) with open(join(dn, package_name, "_git_rev.py"), "w") as outf: - outf.write("GIT_REVISION = %s\n" % repr(git_rev)) + outf.write('GIT_REVISION = "%s"\n' % git_rev) write_git_revision("loopy") @@ -64,37 +61,34 @@ write_git_revision("loopy") # }}} -setup(name="loo.py", +setup(name="loopy", version=ver_dic["VERSION_TEXT"], description="A code generator for array-based code on CPUs and GPUs", - long_description=open("README.rst", "rt").read(), + long_description=open("README.rst").read(), classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: Other Audience', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Visualization', - 'Topic :: Software Development :: Libraries', - 'Topic :: Utilities', + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Other Audience", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Visualization", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", ], + python_requires="~=3.6", install_requires=[ - "pytools>=2020.1", + "pytools>=2020.4", "pymbolic>=2019.2", "genpy>=2016.1.2", "cgen>=2016.1", "islpy>=2019.1", - "six>=1.8.0", "codepy>=2017.1", "colorama", "Mako", @@ -102,7 +96,7 @@ setup(name="loo.py", extras_require={ "pyopencl": [ - "pyopencl>=2015.2", + "pyopencl>=2020.2", ], "fortran": [ # Note that this is *not* regular 'f2py2e', this is @@ -120,7 +114,7 @@ setup(name="loo.py", scripts=["bin/loopy"], author="Andreas Kloeckner", - url="http://mathema.tician.de/software/loopy", + url="https://mathema.tician.de/software/loopy", author_email="inform@tiker.net", license="MIT", packages=find_packages(), diff --git a/test/test_apps.py b/test/test_apps.py index c1ff4b893c459f9860fca3fbda8d06406676b8b5..c1d3410d9b931012874fe5ffd9a6ae866e221be3 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -47,7 +45,7 @@ from loopy.diagnostic import LoopyError __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -101,8 +99,11 @@ def test_convolution(ctx_factory): knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.tag_inames(knl, dict(ifeat="g.2")) - knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", + fetch_outer_inames="im_x_outer, im_y_outer, ifeat", + default_tag="l.auto") knl = lp.add_prefetch(knl, "img", 
"im_x_inner, im_y_inner, f_x, f_y", + fetch_outer_inames="iimg, im_x_outer, im_y_outer, ifeat, icolor", default_tag="l.auto") return knl @@ -567,7 +568,7 @@ def test_poisson_fem(ctx_factory): sdim = 3 knl = lp.make_kernel( - "{ [c,i,j,k,ell,ell2,ell3]: \ + "{ [c,i,j,k,ell,ell2]: \ 0 <= c < nels and \ 0 <= i < nbf and \ 0 <= j < nbf and \ @@ -590,12 +591,12 @@ def test_poisson_fem(ctx_factory): knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"]) def variant_1(knl): - knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') + knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag="for") knl = lp.prioritize_loops(knl, "c,i,j") return knl def variant_2(knl): - knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') + knl = lp.precompute(knl, "dpsi", "i,ell", default_tag="for") knl = lp.prioritize_loops(knl, "c,i,j") return knl @@ -631,10 +632,10 @@ def test_domain_tree_nesting(): TV = lp.TemporaryVariable # noqa - knl = lp.make_kernel(['{[i]: 0 <= i < 12}', - '{[j]: 0 <= j < 100}', - '{[a_count]: 0 <= a_count < a_end}', - '{[b_count]: 0 <= b_count < b_end}'], + knl = lp.make_kernel(["{[i]: 0 <= i < 12}", + "{[j]: 0 <= j < 100}", + "{[a_count]: 0 <= a_count < a_end}", + "{[b_count]: 0 <= b_count < b_end}"], """ for j for i @@ -653,15 +654,15 @@ def test_domain_tree_nesting(): end """, [ - TV('out_map', initializer=out_map, read_only=True, address_space=AS.PRIVATE), - TV('if_val', initializer=if_val, read_only=True, address_space=AS.PRIVATE), - TV('vals', initializer=vals, read_only=True, address_space=AS.PRIVATE), - TV('num_vals', initializer=num_vals, read_only=True, + TV("out_map", initializer=out_map, read_only=True, address_space=AS.PRIVATE), + TV("if_val", initializer=if_val, read_only=True, address_space=AS.PRIVATE), + TV("vals", initializer=vals, read_only=True, address_space=AS.PRIVATE), + TV("num_vals", initializer=num_vals, read_only=True, address_space=AS.PRIVATE), - TV('num_vals_offset', initializer=num_vals_offset, read_only=True, + TV("num_vals_offset", initializer=num_vals_offset, read_only=True, address_space=AS.PRIVATE), - lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), - lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) + lp.GlobalArg("B", shape=(100, 31), dtype=np.float64), + lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)]) parents_per_domain = knl.root_kernel.parents_per_domain() diff --git a/test/test_c_execution.py b/test/test_c_execution.py index b1f335bbb7fdcd9cf1e53603d5b70d1a224ee140..75b4571004cfd046ba35f9407ce614bac0f5d2df 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2017 Nick Curtis" __license__ = """ @@ -25,7 +23,6 @@ THE SOFTWARE. 
import numpy as np import loopy as lp import sys -import six import pytest from loopy import CACHING_ENABLED @@ -63,30 +60,29 @@ def test_c_target(): def test_c_target_strides(): from loopy.target.c import ExecutableCTarget - def __get_kernel(order='C'): + def __get_kernel(order="C"): return lp.make_kernel( "{ [i,j]: 0<=i,j self.ubound: raise BoundsCheckError() @@ -326,11 +321,11 @@ def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed): shape=())) data.extend([ lp.TemporaryVariable(name, get_numpy_type(val)) - for name, val in six.iteritems(var_values) + for name, val in var_values.items() ]) instructions.extend([ lp.Assignment(name, get_numpy_type(val)(val)) - for name, val in six.iteritems(var_values) + for name, val in var_values.items() ]) instructions.append(lp.Assignment(var_name, expr)) @@ -350,7 +345,7 @@ def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed): print(knl) evt, lp_values = knl(queue, out_host=True) - for name, ref_value in six.iteritems(ref_values): + for name, ref_value in ref_values.items(): lp_value = lp_values[name] if expr_type in ["real", "complex"]: err = abs(ref_value-lp_value)/abs(ref_value) @@ -365,7 +360,7 @@ def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed): print(80*"-") print(lp.generate_code_v2(knl).device_code()) print(80*"-") - print("WRONG: %s rel error=%g" % (name, err)) + print(f"WRONG: {name} rel error={err:g}") print("reference=%r" % ref_value) print("loopy=%r" % lp_value) print(80*"-") @@ -381,8 +376,8 @@ def test_sci_notation_literal(ctx_factory): queue = cl.CommandQueue(ctx) set_kernel = lp.make_kernel( - ''' { [i]: 0<=i<12 } ''', - ''' out[i] = 1e-12''') + """ { [i]: 0<=i<12 } """, + """ out[i] = 1e-12""") set_kernel = lp.set_options(set_kernel, write_cl=True) @@ -396,8 +391,8 @@ def test_indexof(ctx_factory): queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - ''' { [i,j]: 0<=i,j<5 } ''', - ''' out[i,j] = indexof(out[i,j])''') + """ { [i,j]: 0<=i,j<5 } """, + """ out[i,j] = indexof(out[i,j])""") knl = lp.set_options(knl, write_cl=True) @@ -420,8 +415,8 @@ def test_indexof_vec(ctx_factory): pytest.skip("target ICD miscompiles vector code") knl = lp.make_kernel( - ''' { [i,j,k]: 0<=i,j,k<4 } ''', - ''' out[i,j,k] = indexof_vec(out[i,j,k])''') + """ { [i,j,k]: 0<=i,j,k<4 } """, + """ out[i,j,k] = indexof_vec(out[i,j,k])""") knl = lp.tag_inames(knl, {"i": "vec"}) knl = lp.tag_data_axes(knl, "out", "vec,c,c") @@ -479,7 +474,7 @@ def test_divide_precedence(ctx_factory): x[0] = c*(a/b) y[0] = c*(a%b) """, - [lp.ValueArg('a, b, c', np.int32), lp.GlobalArg('x, y', np.int32)]) + [lp.ValueArg("a, b, c", np.int32), lp.GlobalArg("x, y", np.int32)]) print(lp.generate_code_v2(knl).device_code()) evt, (x_out, y_out) = knl(queue, c=2, b=2, a=5) diff --git a/test/test_fortran.py b/test/test_fortran.py index 2e67116969d6b5f8fd8d7854bc2617431e3c14d9..a0f3cc7bd615c69e80f1ff2f7b706939e34501de 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -38,11 +36,11 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] -pytestmark = pytest.mark.importorskip("fparser") +pytest.importorskip("fparser") def test_fp_prec_comparison(): @@ -407,8 +405,12 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", 
parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") @@ -586,9 +588,11 @@ def test_precompute_some_exist(ctx_factory): knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", precompute_inames="ktemp,itemp", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", precompute_inames="itemp,k2temp", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") ref_knl = knl diff --git a/test/test_isl.py b/test/test_isl.py index ff58a1bb315d992051018ca38992820156393192..b55224654fea8b22684fdc693704afcb558e00c6 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -54,18 +54,21 @@ def test_pw_aff_to_conditional_expr(): def test_subst_into_pwqpolynomial(): from pymbolic.primitives import Variable arg_dict = { - 'm': 3*Variable("nx"), - 'n': 3*Variable("ny"), - 'nx': Variable('nx'), - 'ny': Variable('ny'), - 'nz': Variable('nz')} + "m": 3*Variable("nx"), + "n": 3*Variable("ny"), + "nx": Variable("nx"), + "ny": Variable("ny"), + "nz": Variable("nz")} space = isl.Set("[nx, ny, nz] -> { []: }").space poly = isl.PwQPolynomial("[m, n] -> { (256 * m + 256 * m * n) : " "m > 0 and n > 0; 256 * m : m > 0 and n <= 0 }") from loopy.isl_helpers import subst_into_pwqpolynomial result = subst_into_pwqpolynomial(space, poly, arg_dict) - assert "(768 * nx + 2304 * nx * ny)" in str(result) + expected_pwqpoly = isl.PwQPolynomial("[nx, ny, nz] -> {" + "(768 * nx + 2304 * nx * ny) : nx > 0 and ny > 0;" + "768 * nx : nx > 0 and ny <= 0 }") + assert (result - expected_pwqpoly).is_zero() if __name__ == "__main__": diff --git a/test/test_linalg.py b/test/test_linalg.py index f075d3493195ec3364c4de0d26f92c4a987e7187..9146e84bff1bba14807504978ed5da09bc31ace4 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -148,7 +146,7 @@ def test_transpose(ctx_factory): outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"], + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -186,8 +184,10 @@ def test_plain_matrix_mul(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -223,8 +223,12 @@ def test_variable_size_matrix_mul(ctx_factory): slabs=(0, 1)) knl = lp.split_iname(knl, "k", 8, slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") - knl = 
lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -263,8 +267,10 @@ def test_funny_shape_matrix_mul(ctx_factory): knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(ref_knl, ctx, knl, @@ -307,8 +313,10 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a") - knl = lp.add_prefetch(knl, "b") + knl = lp.add_prefetch(knl, "a", + fetch_outer_inames="i_outer, i_inner, j_outer, j_inner") + knl = lp.add_prefetch(knl, "b", + fetch_outer_inames="i_outer, i_inner, j_outer, j_inner") return knl def variant_3(knl): @@ -317,8 +325,15 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames="i_outer, j_outer, j_inner", + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames="i_outer, j_outer, j_inner", + temporary_address_space=lp.AddressSpace.LOCAL, + default_tag="l.auto") + return knl def variant_4(knl): @@ -327,8 +342,10 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) - knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) - knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) + knl = lp.add_prefetch(knl, "a", ["i_inner"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) + knl = lp.add_prefetch(knl, "b", ["j_inner"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0") @@ -384,7 +401,8 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1") knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_iname(knl, "k", 16) - knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner_inner", "i_inner_outer"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -424,15 +442,17 @@ def test_intel_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "k", 16) #knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") - knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"], + knl = lp.add_prefetch(knl, "a", ["i_inner_inner", "k_inner", "i_inner_outer"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"], + knl = lp.add_prefetch(knl, "b", ["j_inner_inner", "k_inner", 
"j_inner_outer"], + fetch_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") # FIXME: Grouped prefetch - #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")], + #knl = lp.add_prefetch(knl, "a", ["k_inner", ("i_inner_inner", "i_inner_outer")], # default_tag="l.auto") - #knl = lp.add_prefetch(knl, 'b', + #knl = lp.add_prefetch(knl, "b", # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto") #hints=["k_outer", "k_inner_outer", "k_inner_inner"] @@ -484,9 +504,9 @@ def test_magma_fermi_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "k", 16) knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr") # FIXME - #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"], + #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner_inner", "i_inner_outer"], # default_tag="l.auto") - #knl = lp.add_prefetch(knl, 'b', + #knl = lp.add_prefetch(knl, "b", # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -528,8 +548,12 @@ def test_image_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) # conflict-free - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -574,8 +598,8 @@ def no_test_image_matrix_mul_ilp(ctx_factory): outer_tag="ilp", inner_tag="l.0") knl = lp.split_iname(knl, "k", 2) # conflict-free? 
- knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"], + knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["j_inner_outer", "j_inner_inner", "k_inner"], default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, @@ -608,8 +632,12 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 16, slabs=(0, 1)) - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto") - knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"], default_tag="l.auto") + knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") + knl = lp.add_prefetch(knl, "b", ["k_inner", "j_inner"], + fetch_outer_inames="i_outer, j_outer, k_outer", + default_tag="l.auto") lp.auto_test_vs_ref(seq_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], @@ -640,7 +668,7 @@ def test_small_batched_matvec(ctx_factory): seq_knl = knl align_bytes = 64 - knl = lp.add_prefetch(knl, 'd[:,:]', default_tag="l.auto") + knl = lp.add_prefetch(knl, "d[:,:]", default_tag="l.auto") pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes) knl = lp.split_array_dim(knl, ("f", 0), pad_mult) knl = lp.add_padding(knl, "f", 0, align_bytes) diff --git a/test/test_loopy.py b/test/test_loopy.py index 3c985640bae6cdb07939e1a3a752b642f6dac2e6..09d926b1376689fb7289b95910f4f9e33b651166 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -22,9 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six # noqa: F401 -from six.moves import range - import sys import numpy as np import loopy as lp @@ -48,7 +43,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -68,16 +63,16 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory): out[ii] = 2*out[ii]+cnst[ii]{id=second} """, [lp.TemporaryVariable( - 'cnst', shape=('n'), initializer=cnst, - address_space=lp.AddressSpace.GLOBAL, - read_only=True), '...']) + "cnst", initializer=cnst, + scope=lp.AddressSpace.GLOBAL, + read_only=True), "..."]) knl = lp.fix_parameters(knl, n=16) knl = lp.add_barrier(knl, "id:first", "id:second") knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "ii", 2, outer_tag="g.0", inner_tag="l.0") evt, (out,) = knl(queue, a=a) - assert np.linalg.norm(out-((2*(a+cnst)+cnst))) <= 1e-15 + assert np.linalg.norm(out-(2*(a+cnst)+cnst)) <= 1e-15 def test_complicated_subst(ctx_factory): @@ -182,7 +177,7 @@ def test_simple_side_effect(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j]: 0<=i,j<100}", + "{[i]: 0<=i<100}", """ a[i] = a[i] + 1 """, @@ -458,7 +453,7 @@ def test_nonlinear_index(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j]: 0<=i,j src_ibox = source_boxes[isrc_box] @@ -769,7 +764,7 @@ def test_multiple_writes_to_local_temporary(): # writes are OK. 
knl = lp.make_kernel( - "{[i,e]: 0<=i<5 and 0<=e temp[i, 0] = 17 temp[i, 1] = 15 @@ -845,8 +840,8 @@ def test_auto_test_zero_warmup_rounds(ctx_factory): def test_variable_size_temporary(): knl = lp.make_kernel( - ''' { [i,j]: 0<=i,j z[i] = z[i+1] + z[i] {id=wr_z,dep=top} <> v[i] = 11 {id=wr_v,dep=top} - ... gbarrier {dep=wr_z:wr_v,id=yoink} - z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=wr_z} + ... gbarrier {id=yoink,dep=wr_z:wr_v} + z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink} end ... gbarrier {dep=iupd,id=postloop} z[i] = z[i] - z[i+1] + v[i] {dep=postloop} @@ -1754,7 +1749,7 @@ def test_index_cse(ctx_factory): def test_ilp_and_conditionals(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel('{[k]: 0<=k Tcond = T[k] < 0.5 @@ -1769,7 +1764,7 @@ def test_ilp_and_conditionals(ctx_factory): ref_knl = knl - knl = lp.split_iname(knl, 'k', 2, inner_tag='ilp') + knl = lp.split_iname(knl, "k", 2, inner_tag="ilp") lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -1777,7 +1772,7 @@ def test_ilp_and_conditionals(ctx_factory): def test_unr_and_conditionals(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel('{[k]: 0<=k Tcond[k] = T[k] < 0.5 @@ -1792,7 +1787,7 @@ def test_unr_and_conditionals(ctx_factory): ref_knl = knl - knl = lp.split_iname(knl, 'k', 2, inner_tag='unr') + knl = lp.split_iname(knl, "k", 2, inner_tag="unr") lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -1800,7 +1795,7 @@ def test_unr_and_conditionals(ctx_factory): def test_constant_array_args(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel('{[k]: 0<=k Tcond[k] = T[k] < 0.5 @@ -1809,8 +1804,8 @@ def test_constant_array_args(ctx_factory): end end """, - [lp.ConstantArg('T', shape=(200,), dtype=np.float32), - '...']) + [lp.ConstantArg("T", shape=(200,), dtype=np.float32), + "..."]) knl = lp.fix_parameters(knl, n=200) @@ -1871,33 +1866,33 @@ def test_const_temp_with_initializer_not_saved(): def test_header_extract(): - knl = lp.make_kernel('{[k]: 0<=k {[]: }")], # empty (domain w/unused inames errors) "a = 1", [lp.TemporaryVariable("a", dtype=np.float64, shape=(), base_storage="base")]) @@ -2090,38 +2086,37 @@ def test_integer_reduction(ctx_factory): n = 200 for vtype in [np.int32, np.int64]: var_int = np.random.randint(1000, size=n).astype(vtype) - var_lp = lp.TemporaryVariable('var', initializer=var_int, + var_lp = lp.TemporaryVariable("var", initializer=var_int, read_only=True, address_space=lp.AddressSpace.PRIVATE, dtype=to_loopy_type(vtype), shape=lp.auto) from collections import namedtuple - ReductionTest = namedtuple('ReductionTest', 'kind, check, args') + ReductionTest = namedtuple("ReductionTest", "kind, check, args") reductions = [ - ReductionTest('max', lambda x: x == np.max(var_int), args='var[k]'), - ReductionTest('min', lambda x: x == np.min(var_int), args='var[k]'), - ReductionTest('sum', lambda x: x == np.sum(var_int), args='var[k]'), - ReductionTest('product', lambda x: x == np.prod(var_int), args='var[k]'), - ReductionTest('argmax', + ReductionTest("max", lambda x: x == np.max(var_int), args="var[k]"), + ReductionTest("min", lambda x: x == np.min(var_int), args="var[k]"), + ReductionTest("sum", lambda x: x == np.sum(var_int), args="var[k]"), + ReductionTest("product", lambda x: x == np.prod(var_int), args="var[k]"), + ReductionTest("argmax", lambda x: ( x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)), - args='var[k], k'), - ReductionTest('argmin', + args="var[k], k"), + ReductionTest("argmin", lambda x: ( x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)), - args='var[k], k') + 
args="var[k], k") ] for reduction, function, args in reductions: - kstr = ("out" if 'arg' not in reduction + kstr = ("out" if "arg" not in reduction else "out[0], out[1]") - kstr += ' = {0}(k, {1})'.format(reduction, args) - knl = lp.make_kernel('{[k]: 0<=kind = indirect(offsets[i], offsets[i + 1], 1) out[i] = data[ind] end """, - [lp.GlobalArg('out', shape=('n',)), + [lp.GlobalArg("out", shape=("n",)), lp.TemporaryVariable( - 'offsets', shape=(offsets.size,), initializer=offsets, + "offsets", shape=(offsets.size,), initializer=offsets, address_space=lp.AddressSpace.GLOBAL, read_only=True), - lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)], + lp.GlobalArg("data", shape=(data.size,), dtype=np.float64)], ) # fixt params, and add manglers / preamble @@ -2557,13 +2549,13 @@ def test_preamble_with_separate_temporaries(ctx_factory): SeparateTemporariesPreambleTestPreambleGenerator, ) func_info = dict( - func_name='indirect', + func_name="indirect", func_arg_dtypes=(np.int32, np.int32, np.int32), func_result_dtypes=(np.int32,), arr=lookup ) - kernel = lp.fix_parameters(kernel, **{'n': n}) + kernel = lp.fix_parameters(kernel, **{"n": n}) kernel = lp.register_preamble_generators( kernel, [SeparateTemporariesPreambleTestPreambleGenerator(**func_info)]) kernel = lp.register_function_manglers( @@ -2575,7 +2567,7 @@ def test_preamble_with_separate_temporaries(ctx_factory): queue = cl.CommandQueue(ctx) # check that it actually performs the lookup correctly assert np.allclose(kernel( - queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1]) + queue, data=data.flatten("C"))[1][0], data[offsets[:-1] + 1]) def test_arg_inference_for_predicates(): @@ -2705,7 +2697,7 @@ def test_dep_cycle_printing_and_error(): # https://gitlab.tiker.net/inducer/loopy/issues/140 # This kernel has two dep cycles. - knl = lp.make_kernel('{[i,j,k]: 0 <= i,j,k < 12}', + knl = lp.make_kernel("{[i,j,k]: 0 <= i,j,k < 12}", """ for j for i @@ -2725,11 +2717,11 @@ def test_dep_cycle_printing_and_error(): end end """, - [lp.GlobalArg('a', shape=(12, 12), dtype=np.int32)]) + [lp.GlobalArg("a", shape=(12, 12), dtype=np.int32)]) - knl = lp.split_iname(knl, 'j', 4, inner_tag='vec') - knl = lp.split_array_axis(knl, 'a', 1, 4) - knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec') + knl = lp.split_iname(knl, "j", 4, inner_tag="vec") + knl = lp.split_array_axis(knl, "a", 1, 4) + knl = lp.tag_array_axes(knl, "a", "N1,N0,vec") knl = lp.preprocess_kernel(knl) from loopy.diagnostic import DependencyCycleFound @@ -2748,7 +2740,7 @@ def test_backwards_dep_printing_and_error(): d[i] = 7*a[i ] {id=insn5, dep=insn4} a[i] = a[i] + d[i] {id=insn6, dep=insn5} """, [ - lp.GlobalArg('a, b', dtype=np.float64), + lp.GlobalArg("a, b", dtype=np.float64), "..." 
]) @@ -2829,9 +2821,9 @@ def test_shape_mismatch_check(ctx_factory): def test_array_arg_extra_kwargs_persis_hash(): from loopy.tools import LoopyKeyBuilder - a = lp.ArrayArg('a', shape=(10, ), dtype=np.float64, + a = lp.ArrayArg("a", shape=(10, ), dtype=np.float64, address_space=lp.AddressSpace.LOCAL) - not_a = lp.ArrayArg('a', shape=(10, ), dtype=np.float64, + not_a = lp.ArrayArg("a", shape=(10, ), dtype=np.float64, address_space=lp.AddressSpace.PRIVATE) key_builder = LoopyKeyBuilder() @@ -2887,13 +2879,79 @@ def test_non_integral_array_idx_raises(): """ out[j] = 0 {id=init} out[i] = a[1.94**i-1] {dep=init} - """, [lp.GlobalArg('a', np.float64), '...']) + """, [lp.GlobalArg("a", np.float64), "..."]) from loopy.diagnostic import LoopyError with pytest.raises(LoopyError): print(lp.generate_code_v2(knl).device_code()) +@pytest.mark.parametrize("tag", ["for", "l.0", "g.0", "fixed"]) +def test_empty_domain(ctx_factory, tag): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + prg = lp.make_kernel( + "{[i,j]: 0 <= i < n}", + """ + for i + c = 1 + end + """) + + if tag == "fixed": + prg = lp.fix_parameters(prg, n=0) + kwargs = {} + else: + prg = lp.tag_inames(prg, {"i": tag}) + kwargs = {"n": 0} + + prg = lp.set_options(prg, write_code=True) + c = cl.array.zeros(queue, (), dtype=np.int32) + prg(queue, c=c, **kwargs) + + assert (c.get() == 0).all() + + +def test_access_check_with_conditionals(): + legal_knl = lp.make_kernel( + "{[i]: 0<=i<20}", + """ + z[i] = x[i] if i < 10 else y[i-10] + z[i] = x[i] if 0 else 2.0f + z[i] = in[i-1] if i else 3.14f + """, + [lp.GlobalArg("x,y", shape=(10,), dtype=float), + lp.GlobalArg("in", shape=(19,), dtype=float), + ...], seq_dependencies=True) + lp.generate_code_v2(legal_knl) + + illegal_knl = lp.make_kernel( + "{[i]: 0<=i<20}", + """ + z[i] = x[i] if i < 10 else y[i] + """, + [lp.GlobalArg("x,y", shape=(10,), dtype=float), + ...]) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.generate_code_v2(illegal_knl) + + # current limitation: cannot handle non-affine conditions + legal_but_nonaffine_condition_knl = lp.make_kernel( + "{[i]: 0<=i<20}", + """ + z[i] = x[i] if i*i < 100 else y[i-10] + """, + [lp.GlobalArg("x,y", shape=(10,), dtype=float), + ...]) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.generate_code_v2(legal_but_nonaffine_condition_knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_misc.py b/test/test_misc.py index 7a834a6f5d393298e97df22d47a1de3b64354a42..58ba732ac1ddd0f1f0eaff9aeb83b9b38902cb49 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2016 Matt Wala" __license__ = """ @@ -22,9 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six # noqa import pytest -from six.moves import range import sys @@ -35,67 +31,23 @@ logger = logging.getLogger(__name__) from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -def test_compute_sccs(): - from loopy.tools import compute_sccs - import random - - rng = random.Random(0) - - def generate_random_graph(nnodes): - graph = dict((i, set()) for i in range(nnodes)) - for i in range(nnodes): - for j in range(nnodes): - # Edge probability 2/n: Generates decently interesting inputs. 
- if rng.randint(0, nnodes - 1) <= 1: - graph[i].add(j) - return graph - - def verify_sccs(graph, sccs): - visited = set() - - def visit(node): - if node in visited: - return [] - else: - visited.add(node) - result = [] - for child in graph[node]: - result = result + visit(child) - return result + [node] - - for scc in sccs: - scc = set(scc) - assert not scc & visited - # Check that starting from each element of the SCC results - # in the same set of reachable nodes. - for scc_root in scc: - visited.difference_update(scc) - result = visit(scc_root) - assert set(result) == scc, (set(result), scc) - - for nnodes in range(10, 20): - for i in range(40): - graph = generate_random_graph(nnodes) - verify_sccs(graph, compute_sccs(graph)) - - def test_SetTrie(): from loopy.kernel.tools import SetTrie s = SetTrie() - s.add_or_update(set([1, 2, 3])) - s.add_or_update(set([4, 2, 1])) - s.add_or_update(set([1, 5])) + s.add_or_update({1, 2, 3}) + s.add_or_update({4, 2, 1}) + s.add_or_update({1, 5}) result = [] s.descend(lambda prefix: result.extend(prefix)) assert result == [1, 2, 3, 4, 5] with pytest.raises(ValueError): - s.add_or_update(set([1, 4])) + s.add_or_update({1, 4}) -class PickleDetector(object): +class PickleDetector: """Contains a class attribute which flags if any instance was unpickled. """ diff --git a/test/test_nbody.py b/test/test_nbody.py index 5b36ed4163c650317d8656883eeda599a3c21faa..1254be7d37e2800dda163598b2a75c44a29641b6 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -77,7 +75,8 @@ def test_nbody(ctx_factory): outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 256) knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"], - ["x_fetch_j", "x_fetch_k"], default_tag=None) + ["x_fetch_j", "x_fetch_k"], + fetch_outer_inames="i_outer, j_outer", default_tag=None) knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0")) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index de0bcf70a7f3f86152e86524486e2730522df325..74d53b07018a72ee189eefb0ea02b194bb663629 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -1,6 +1,5 @@ """gNUMA differentiation kernel, wrapped up as a test.""" -from __future__ import division __copyright__ = "Copyright (C) 2015 Andreas Kloeckner, Lucas Wilcox" @@ -30,8 +29,6 @@ import pyopencl as cl import sys import os -pytestmark = pytest.mark.importorskip("fparser") - import logging logger = logging.getLogger(__name__) @@ -51,10 +48,11 @@ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa @pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("opt_level", [11]) def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa + pytest.importorskip("fparser") ctx = ctx_factory() filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90") - with open(filename, "r") as sourcef: + with open(filename) as sourcef: source = sourcef.read() source = source.replace("datafloat", "real*4") @@ -91,7 +89,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if opt_level == 0: tap_hsv = hsv - hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto") + hsv = lp.add_prefetch(hsv, "D[:,:]", fetch_outer_inames="e", + default_tag="l.auto") if opt_level == 1: tap_hsv = hsv diff --git a/test/test_reduction.py 
b/test/test_reduction.py index 96bf7d70909eada3c77048d6ccb459a6f7a69367..965e5f1ab90ce2b4afe0f519ca85623d51ceb70d 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ diff --git a/test/test_scan.py b/test/test_scan.py index 101d8fc35f224c02ac6e836cbb49f65b3dd387a4..31875ce5d8ccaf824d090c17a57dfd7e347ba4d3 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = """ Copyright (C) 2012 Andreas Kloeckner Copyright (C) 2016, 2017 Matt Wala @@ -366,7 +364,7 @@ def test_argmax(ctx_factory, i_tag): def check_segmented_scan_output(arr, segment_boundaries_indices, out): - class SegmentGrouper(object): + class SegmentGrouper: def __init__(self): self.seg_idx = 0 diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index e022e92f3712d984c1ad68061d0052240ff9d20c..901affc57a63a45f4147940cfc0b9c03e57522d0 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -1,5 +1,3 @@ -from __future__ import division - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -48,7 +46,7 @@ def test_tim2d(ctx_factory): # K - run-time symbolic knl = lp.make_kernel( - "{[i,j,e,m,o,o2,gi]: 0<=i,j,m,o,o2= 0 and n >= k") + return k + + +@pytest.fixture +def split(vanilla): + k = lp.split_iname(vanilla, "i", 4, slabs=(1, 1)) + k = lp.prioritize_loops(k, "i_outer,i_inner") + return k + + +@pytest.fixture(params=[(1, 4), (1, 5), (4, 8)], + ids=lambda x: "{k=%s, n=%s}" % x) +def parameters(request): + return dict(zip("kn", request.param)) + + +def test_split_slabs(ctx_factory, vanilla, split, parameters): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + expect = clarray.zeros(queue, 8, dtype=np.int32) + actual = clarray.zeros(queue, 8, dtype=np.int32) + _, (expect, ) = vanilla(queue, a=expect, **parameters) + _, (actual, ) = split(queue, a=actual, **parameters) + assert np.array_equal(expect.get(), actual.get()) diff --git a/test/test_statistics.py b/test/test_statistics.py index ef5450599126df9f1acbfbcb544b2362438f2f90..c1ca86d35bf8687bda11b5068b81f9d48cfe8113 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1,5 +1,3 @@ -from __future__ import division, print_function - __copyright__ = "Copyright (C) 2015 James Stevens" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six import sys from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl @@ -66,16 +63,16 @@ def test_op_counter_basic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params = {"n": n, "m": m, "ell": ell} + f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict( + f32div = op_map[lp.Op(np.float32, "div", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP, knl.name) + f64mul = op_map[lp.Op(np.dtype(np.float64), "mul", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == f32div == n*m*ell*n_subgroups @@ -101,15 +98,15 @@ def test_op_counter_reduction(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params = {"n": n, "m": m, "ell": ell} + f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP, knl.name) + f32mul = op_map[lp.Op(np.dtype(np.float32), "mul", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == n*m*ell*n_subgroups - op_map_dtype = op_map.group_by('dtype') + op_map_dtype = op_map.group_by("dtype") f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) assert f32 == f32add + f32mul @@ -137,14 +134,14 @@ def test_op_counter_logic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params = {"n": n, "m": m, "ell": ell} + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + f64add = op_map[lp.Op(np.float64, "add", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP, knl.name) + f64div = op_map[lp.Op(np.dtype(np.float64), "div", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == n*m*n_subgroups @@ -177,22 +174,22 @@ def test_op_counter_specialops(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params = {"n": n, "m": m, "ell": ell} + f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict( + f32div = op_map[lp.Op(np.float32, "div", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + f32add = op_map[lp.Op(np.float32, "add", 
CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP, knl.name)].eval_with_dict( + f64pow = op_map[lp.Op(np.float64, "pow", CG.SUBGROUP, knl.name)].eval_with_dict( params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP, knl.name) + f64add = op_map[lp.Op(np.dtype(np.float64), "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) + i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP, knl.name) + f64rsq = op_map[lp.Op(np.dtype(np.float64), "func:rsqrt", CG.SUBGROUP, knl.name) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP, knl.name) + f64sin = op_map[lp.Op(np.dtype(np.float64), "func:sin", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32div == 2*n*m*ell*n_subgroups @@ -227,25 +224,25 @@ def test_op_counter_bitwise(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} print(op_map) i32add = op_map[ - lp.Op(np.int32, 'add', CG.SUBGROUP, 'bitwise') + lp.Op(np.int32, "add", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i32bw = op_map[ - lp.Op(np.int32, 'bw', CG.SUBGROUP, 'bitwise') + lp.Op(np.int32, "bw", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64bw = op_map[ - lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP, 'bitwise') + lp.Op(np.dtype(np.int64), "bw", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64mul = op_map[ - lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP, 'bitwise') + lp.Op(np.dtype(np.int64), "mul", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64add = op_map[ - lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP, 'bitwise') + lp.Op(np.dtype(np.int64), "add", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64shift = op_map[ - lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP, 'bitwise') + lp.Op(np.dtype(np.int64), "shift", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert i32add == n*m*ell*n_subgroups @@ -280,7 +277,7 @@ def test_op_counter_triangular_domain(): knl, subgroup_size=SGS, count_redundant_work=True - )[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)] + )[lp.Op(np.float64, "mul", CG.SUBGROUP, knl.name)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -316,34 +313,34 @@ def test_mem_access_counter_basic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32l = mem_map[lp.MemAccess('global', np.float32, + f32l = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32l += mem_map[lp.MemAccess('global', np.float32, + f32l += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64l = mem_map[lp.MemAccess('global', np.float64, + f64l = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", 
count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64l += mem_map[lp.MemAccess('global', np.float64, + f64l += mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -352,15 +349,15 @@ def test_mem_access_counter_basic(): assert f32l == (3*n*m*ell)*n_subgroups assert f64l == (2*n*m)*n_subgroups - f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), + f32s = mem_map[lp.MemAccess("global", np.dtype(np.float32), lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64s = mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -386,22 +383,22 @@ def test_mem_access_counter_reduction(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32l = mem_map[lp.MemAccess('global', np.float32, + f32l = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32l += mem_map[lp.MemAccess('global', np.float32, + f32l += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -409,9 +406,9 @@ def test_mem_access_counter_reduction(): # uniform: (count-per-sub-group)*n_subgroups assert f32l == (2*n*m*ell)*n_subgroups - f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), + f32s = mem_map[lp.MemAccess("global", np.dtype(np.float32), lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -419,9 +416,9 @@ def test_mem_access_counter_reduction(): # uniform: (count-per-sub-group)*n_subgroups assert f32s == (n*ell)*n_subgroups - ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] + ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"] ).to_bytes().eval_and_sum(params) - st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] + st_bytes = mem_map.filter_by(mtype=["global"], direction=["store"] ).to_bytes().eval_and_sum(params) assert ld_bytes == 4*f32l assert st_bytes == 4*f32s @@ -447,23 +444,23 @@ def test_mem_access_counter_logic(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') + reduced_map = mem_map.group_by("mtype", "dtype", "direction") - f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), - direction='load') + f32_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float32), + 
direction="load") ].eval_with_dict(params) - f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), - direction='load') + f64_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64), + direction="load") ].eval_with_dict(params) - f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), - direction='store') + f64_g_s = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64), + direction="store") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -492,34 +489,34 @@ def test_mem_access_counter_specialops(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - f32 = mem_map[lp.MemAccess('global', np.float32, + f32 = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32 += mem_map[lp.MemAccess('global', np.float32, + f32 += mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64 = mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), + f64 += mem_map[lp.MemAccess("global", np.dtype(np.float64), lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -528,15 +525,15 @@ def test_mem_access_counter_specialops(): assert f32 == (2*n*m*ell)*n_subgroups assert f64 == (2*n*m)*n_subgroups - f32 = mem_map[lp.MemAccess('global', np.float32, + f32 = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64 = mem_map[lp.MemAccess('global', np.float64, + f64 = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -545,7 +542,7 @@ def test_mem_access_counter_specialops(): assert f32 == (n*m*ell)*n_subgroups assert f64 == (n*m)*n_subgroups - filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], + filtered_map = mem_map.filter_by(direction=["load"], variable=["a", "g"], count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) @@ -575,34 +572,34 @@ def test_mem_access_counter_bitwise(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group - i32 = mem_map[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='a', + direction="load", variable="a", count_granularity=CG.SUBGROUP, 
kernel_name=knl.name) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), + i32 += mem_map[lp.MemAccess("global", np.dtype(np.int32), lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -610,15 +607,15 @@ def test_mem_access_counter_bitwise(): # uniform: (count-per-sub-group)*n_subgroups assert i32 == (4*n*m+2*n*m*ell)*n_subgroups - i32 = mem_map[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='store', variable='c', + direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -650,7 +647,7 @@ def test_mem_access_counter_mixed(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = div_ceil(ell, group_size_0) group_size = group_size_0 @@ -659,37 +656,37 @@ def test_mem_access_counter_mixed(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - f64uniform = mem_map[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='g', + direction="load", variable="g", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f64uniform += mem_map[lp.MemAccess('global', np.float64, + f64uniform += mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='load', variable='h', + direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32uniform = mem_map[lp.MemAccess('global', np.float32, + f32uniform = mem_map[lp.MemAccess("global", np.float32, lid_strides={}, gid_strides={}, - direction='load', variable='x', + direction="load", variable="x", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='load', - variable='a', + f32nonconsec = mem_map[lp.MemAccess("global", np.dtype(np.float32), + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="load", + variable="a", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) - f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='load', - variable='b', + f32nonconsec += 
mem_map[lp.MemAccess("global", np.dtype(np.float32), + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="load", + variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -715,17 +712,17 @@ def test_mem_access_counter_mixed(): else: assert f32nonconsec == 3*n*m*ell - f64uniform = mem_map[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess("global", np.float64, lid_strides={}, gid_strides={}, - direction='store', variable='e', + direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*group_size_0}, - direction='store', - variable='c', + f32nonconsec = mem_map[lp.MemAccess("global", np.float32, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*group_size_0}, + direction="store", + variable="c", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -764,55 +761,55 @@ def test_mem_access_counter_nonconsec(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} - f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', - variable='g', + params = {"n": n, "m": m, "ell": ell} + f64nonconsec = mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", + variable="g", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) - f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', - variable='h', + f64nonconsec += mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", + variable="h", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', variable='a', + "global", np.dtype(np.float32), + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", variable="a", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', variable='b', + "global", np.dtype(np.float32), + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell - f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='store', - variable='e', + f64nonconsec = mem_map[lp.MemAccess("global", np.float64, + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="store", + variable="e", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( - 'global', 
np.float32, - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='store', variable='c', + "global", np.float32, + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="store", variable="c", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -822,40 +819,40 @@ def test_mem_access_counter_nonconsec(): mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( - 'global', + "global", np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', variable='g', + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", variable="g", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( - 'global', + "global", np.float64, - lid_strides={0: Variable('m')}, - gid_strides={0: Variable('m')*lsize0}, - direction='load', variable='h', + lid_strides={0: Variable("m")}, + gid_strides={0: Variable("m")*lsize0}, + direction="load", variable="h", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( - 'global', + "global", np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', - variable='a', + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", + variable="a", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( - 'global', + "global", np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - gid_strides={0: Variable('m')*Variable('ell')*lsize0}, - direction='load', - variable='b', + lid_strides={0: Variable("m")*Variable("ell")}, + gid_strides={0: Variable("m")*Variable("ell")*lsize0}, + direction="load", + variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -879,39 +876,39 @@ def test_mem_access_counter_consec(): knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size='guess') + subgroup_size="guess") n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} f64consec = mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='load', variable='g', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="load", variable="g", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='load', variable='h', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="load", variable="h", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( - 'global', np.float32, + "global", np.float32, lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='load', variable='a', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="load", variable="a", count_granularity=CG.WORKITEM, 
kernel_name=knl.name) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess( - 'global', np.dtype(np.float32), + "global", np.dtype(np.float32), lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='load', variable='b', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="load", variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -919,17 +916,17 @@ def test_mem_access_counter_consec(): assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess( - 'global', np.float64, - lid_strides={0: 1}, gid_strides={0: Variable('m')}, - direction='store', variable='e', + "global", np.float64, + lid_strides={0: 1}, gid_strides={0: Variable("m")}, + direction="store", variable="e", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( - 'global', np.float32, + "global", np.float32, lid_strides={0: 1}, - gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, - direction='store', variable='c', + gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")}, + direction="store", variable="c", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -945,7 +942,7 @@ def test_count_granularity_val_checks(): lp.MemAccess(count_granularity=CG.WORKGROUP) lp.MemAccess(count_granularity=None) assert True - lp.MemAccess(count_granularity='bushel') + lp.MemAccess(count_granularity="bushel") assert False except ValueError: assert True @@ -956,7 +953,7 @@ def test_count_granularity_val_checks(): lp.Op(count_granularity=CG.WORKGROUP) lp.Op(count_granularity=None) assert True - lp.Op(count_granularity='bushel') + lp.Op(count_granularity="bushel") assert False except ValueError: assert True @@ -980,7 +977,7 @@ def test_barrier_counter_nobarriers(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} assert len(sync_map) == 1 assert sync_map.filter_by(kind="kernel_launch").eval_and_sum(params) == 1 @@ -1006,7 +1003,7 @@ def test_barrier_counter_barriers(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum(params) assert barrier_count == 50*10*2 @@ -1044,7 +1041,7 @@ def test_all_counters_parallel_matmul(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} group_size = bsize*bsize n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize) subgroups_per_group = div_ceil(group_size, SGS) @@ -1057,16 +1054,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name) + lp.Op(np.float32, "mul", CG.SUBGROUP, knl.name) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name) + lp.Op(np.float32, "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name) + lp.Op(np.int32, "add", CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP, knl.name) + lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1075,28 +1072,28 @@ def test_all_counters_parallel_matmul(): mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, 
subgroup_size=SGS) - f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, + f32s1lb = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, gid_strides={1: bsize}, - direction='load', variable='b', + direction="load", variable="b", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) - f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('m')}, - gid_strides={0: Variable('m')*bsize}, - direction='load', - variable='a', count_granularity=CG.WORKITEM, + f32s1la = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("m")}, + gid_strides={0: Variable("m")*bsize}, + direction="load", + variable="a", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize - f32coal = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, - gid_strides={0: Variable('ell')*bsize, 1: bsize}, - direction='store', variable='c', + f32coal = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, + gid_strides={0: Variable("ell")*bsize, 1: bsize}, + direction="store", variable="c", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -1105,26 +1102,26 @@ def test_all_counters_parallel_matmul(): local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=SGS).filter_by(mtype=['local']) + subgroup_size=SGS).filter_by(mtype=["local"]) - local_mem_l = local_mem_map.filter_by(direction=['load'] + local_mem_l = local_mem_map.filter_by(direction=["load"] ).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert local_mem_l == m*2*n_subgroups - local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load', + local_mem_l_a = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), + direction="load", lid_strides={1: 16}, gid_strides={}, - variable='a_fetch', + variable="a_fetch", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) - local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load', + local_mem_l_b = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), + direction="load", lid_strides={0: 1}, gid_strides={}, - variable='b_fetch', + variable="b_fetch", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -1132,7 +1129,7 @@ def test_all_counters_parallel_matmul(): # (count-per-sub-group)*n_subgroups assert local_mem_l_a == local_mem_l_b == m*n_subgroups - local_mem_s = local_mem_map.filter_by(direction=['store'] + local_mem_s = local_mem_map.filter_by(direction=["store"] ).eval_and_sum(params) # (count-per-sub-group)*n_subgroups @@ -1200,7 +1197,7 @@ def test_mem_access_tagged_variables(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} group_size = bsize*bsize n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize) subgroups_per_group = div_ceil(group_size, SGS) @@ -1209,20 +1206,20 @@ def test_mem_access_tagged_variables(): mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, + f32s1lb = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={0: 1}, gid_strides={1: bsize}, - direction='load', variable='b', - variable_tag='mmbload', + 
direction="load", variable="b", + variable_tag="mmbload", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) - f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={1: Variable('m')}, - gid_strides={0: Variable('m')*bsize}, - direction='load', - variable='a', - variable_tag='mmaload', + f32s1la = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={1: Variable("m")}, + gid_strides={0: Variable("m")*bsize}, + direction="load", + variable="a", + variable_tag="mmaload", count_granularity=CG.SUBGROUP, kernel_name=knl.name) ].eval_with_dict(params) @@ -1232,11 +1229,11 @@ def test_mem_access_tagged_variables(): # uniform: (count-per-sub-group)*n_subgroups assert f32s1la == m*n_subgroups - f32coal = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('ell')}, - gid_strides={0: Variable('ell')*bsize, 1: bsize}, - direction='store', variable='c', - variable_tag='mmresult', + f32coal = mem_access_map[lp.MemAccess("global", np.float32, + lid_strides={0: 1, 1: Variable("ell")}, + gid_strides={0: Variable("ell")*bsize, 1: bsize}, + direction="store", variable="c", + variable_tag="mmresult", count_granularity=CG.WORKITEM, kernel_name=knl.name) ].eval_with_dict(params) @@ -1256,7 +1253,7 @@ def test_gather_access_footprint(): from loopy.statistics import gather_access_footprints, count fp = gather_access_footprints(knl) - for key, footprint in six.iteritems(fp): + for key, footprint in fp.item(): print(key, count(knl.root_kernel, footprint)) @@ -1271,7 +1268,7 @@ def test_gather_access_footprint_2(): fp = gather_access_footprints(knl) params = {"n": 200} - for key, footprint in six.iteritems(fp): + for key, footprint in fp.items(): assert count(knl.root_kernel, footprint).eval_with_dict(params) == 200 print(key, count(knl.root_kernel, footprint)) @@ -1294,7 +1291,7 @@ def test_summations_and_filters(): n = 512 m = 256 ell = 128 - params = {'n': n, 'm': m, 'ell': ell} + params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 @@ -1304,24 +1301,24 @@ def test_summations_and_filters(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) - loads_a = mem_map.filter_by(direction=['load'], variable=['a'], + loads_a = mem_map.filter_by(direction=["load"], variable=["a"], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups assert loads_a == (2*n*m*ell)*n_subgroups - global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], + global_stores = mem_map.filter_by(mtype=["global"], direction=["store"], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups assert global_stores == (n*m*ell + n*m)*n_subgroups - ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], + ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"], count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) - st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], + st_bytes = mem_map.filter_by(mtype=["global"], direction=["store"], count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) @@ -1330,10 +1327,10 @@ def test_summations_and_filters(): assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups # ignore stride and variable names in this map - reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') - f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load') + reduced_map = mem_map.group_by("mtype", "dtype", "direction") + f32lall = 
reduced_map[lp.MemAccess("global", np.float32, direction="load") ].eval_with_dict(params) - f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') + f64lall = reduced_map[lp.MemAccess("global", np.float64, direction="load") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -1345,7 +1342,7 @@ def test_summations_and_filters(): #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) - op_map_dtype = op_map.group_by('dtype') + op_map_dtype = op_map.group_by("dtype") f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params) i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params) @@ -1353,7 +1350,7 @@ def test_summations_and_filters(): assert f64 == n*m assert i32 == n*m*2 - addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params) + addsub_all = op_map.filter_by(name=["add", "sub"]).eval_and_sum(params) f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params) assert addsub_all == n*m*ell + n*m*2 assert f32ops_all == n*m*ell*3 @@ -1361,16 +1358,16 @@ def test_summations_and_filters(): non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params) assert non_field == 0 - ops_nodtype = op_map.group_by('name') - ops_noname = op_map.group_by('dtype') - mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params) + ops_nodtype = op_map.group_by("name") + ops_noname = op_map.group_by("dtype") + mul_all = ops_nodtype[lp.Op(name="mul")].eval_with_dict(params) f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params) assert mul_all == n*m*ell + n*m assert f64ops_all == n*m def func_filter(key): return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) and \ - key.direction == 'load' + key.direction == "load" f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) # uniform: (count-per-sub-group)*n_subgroups @@ -1394,7 +1391,7 @@ def test_strided_footprint(): knl = lp.split_iname(knl, "i_inner", bx, outer_tag="unr", inner_tag="l.0") footprints = lp.gather_access_footprints(knl) - x_l_foot = footprints[('x', 'read')] + x_l_foot = footprints[("x", "read")] from loopy.statistics import count num = count(knl.root_kernel, x_l_foot).eval_with_dict(param_dict) diff --git a/test/test_target.py b/test/test_target.py index 0d34310664e027f8e7ee133da871c91723295d10..e5b743d37fcae25db088854199ae8f15ed387d8a 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -48,7 +46,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -225,8 +223,9 @@ def test_tuple(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + import islpy as isl knl = lp.make_kernel( - "{ [i]: 0 = i }", + [isl.BasicSet("[] -> {[]: }")], """ a, b = make_tuple(1, 2.) 
""") @@ -272,9 +271,11 @@ def test_numba_cuda_target(): target=lp.NumbaCudaTarget()) knl = lp.assume(knl, "M>0") - knl = lp.split_iname(knl, "i", 16, outer_tag='g.0') - knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1)) - knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto") + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") + knl = lp.split_iname(knl, "j", 128, inner_tag="l.0", slabs=(0, 1)) + knl = lp.add_prefetch(knl, "X[i,:]", + fetch_outer_inames="i_inner, i_outer, j_inner", + default_tag="l.auto") knl = lp.fix_parameters(knl, N=3) knl = lp.prioritize_loops(knl, "i_inner,j_outer") knl = lp.tag_inames(knl, "k:unr") @@ -318,7 +319,7 @@ def test_child_invalid_type_cast(): def test_target_invalid_type_cast(): - dtype = np.dtype([('', ' 1: exec(sys.argv[1]) diff --git a/test/test_transform.py b/test/test_transform.py index 9300f45c33b80195facc70a44c360363a69b2396..ff593a0c85852701ada5cbecee6d4869941c29af 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -46,7 +44,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ __all__ = [ "pytest_generate_tests", - "cl" # 'cl.create_some_context' + "cl" # "cl.create_some_context" ] @@ -75,7 +73,7 @@ def test_collect_common_factors(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j out_tmp = 0 {id=out_init,inames=i} out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} @@ -99,8 +97,8 @@ def test_to_batched(ctx_factory): queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - ''' { [i,j]: 0<=i,j {[i,j]: 0<=i,j alpha = 2.0 {id=init_alpha} + for i + for j + c[i, j] = alpha*a[i]*b[j] {id=outerproduct} + end + end + """ + ], + [ + lp.GlobalArg("a", dtype, shape=("n",), order=order), + lp.GlobalArg("b", dtype, shape=("n",), order=order), + lp.GlobalArg("c", dtype, shape=("n, n"), order=order), + lp.ValueArg("n", np.int32, approximately=n), + ], + name="rank_one", + assumptions="n >= 16", + lang_version=(2018, 2)) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16, + outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "j", 16, + outer_tag="g.1", inner_tag="l.1") + + knl = lp.add_prefetch(knl, "a") + knl = lp.add_prefetch(knl, "b") + + knl = lp.add_inames_for_unused_hw_axes(knl) + + assert knl.id_to_insn["init_alpha"].within_inames == frozenset(["i_inner", + "i_outer", "j_outer", "j_inner"]) + assert knl.id_to_insn["a_fetch_rule"].within_inames == frozenset(["i_inner", + "i_outer", "j_outer", "j_inner"]) + assert knl.id_to_insn["b_fetch_rule"].within_inames == frozenset(["i_inner", + "i_outer", "j_outer", "j_inner"]) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, + op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], + parameters={"n": n}) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/testlib.py b/test/testlib.py index c66367a7ccaeae2794c3f5ffa82e5670ada721c2..2d2a535fb2369e526c4b9304a60d680763cd8461 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -4,7 +4,7 @@ import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel -class GridOverride(object): +class GridOverride: def __init__(self, clean, vecsize): self.clean = clean self.vecsize = vecsize @@ -59,15 +59,15 @@ class SeparateTemporariesPreambleTestMangler( # check types if len(arg_dtypes) != len(arg_dtypes): - raise Exception('Unexpected number of arguments provided 
to mangler ' - '{}, expected {}, got {}'.format( + raise Exception("Unexpected number of arguments provided to mangler " + "{}, expected {}, got {}".format( self.func_name, len(self.func_arg_dtypes), len(arg_dtypes))) for i, (d1, d2) in enumerate(zip(self.func_arg_dtypes, arg_dtypes)): if not __compare(d1, d2): - raise Exception('Argument at index {} for mangler {} does not ' - 'match expected dtype. Expected {}, got {}'. + raise Exception("Argument at index {} for mangler {} does not " + "match expected dtype. Expected {}, got {}". format(i, self.func_name, str(d1), str(d2))) # get target for creation @@ -87,7 +87,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator( func_match = next( (x for x in preamble_info.seen_functions if x.name == self.func_name), None) - desc = 'custom_funcs_indirect' + desc = "custom_funcs_indirect" if func_match is not None: from loopy.types import to_loopy_type # check types @@ -95,7 +95,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator( func_match.arg_dtypes: # if match, create our temporary var = lp.TemporaryVariable( - 'lookup', initializer=self.arr, dtype=self.arr.dtype, + "lookup", initializer=self.arr, dtype=self.arr.dtype, shape=self.arr.shape, address_space=lp.AddressSpace.GLOBAL, read_only=True) # and code @@ -129,7 +129,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator( decl = Initializer(decl, generate_array_literal( codegen_state, var, var.initializer)) # return generated code - yield (desc, '\n'.join([str(decl), code])) + yield (desc, "\n".join([str(decl), code])) # }}}
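
A minimal sketch (not part of the patch) of the counting interface that the test_statistics.py hunks above exercise: lp.get_op_map returns a map that can be indexed by a fully specified lp.Op, or collapsed with group_by/filter_by before evaluation. The kernel, the subgroup size of 32, and the parameter value below are illustrative assumptions; CG abbreviates loopy.statistics.CountGranularity, which the test module imports outside this excerpt.

    import numpy as np
    import loopy as lp
    from loopy.statistics import CountGranularity as CG

    # Illustrative kernel; any small elementwise kernel behaves the same way.
    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i] + b[i]",
            [lp.GlobalArg("a,b,out", dtype=np.float32, shape=("n",)), ...],
            lang_version=(2018, 2))

    op_map = lp.get_op_map(knl, subgroup_size=32, count_redundant_work=True)
    params = {"n": 512}

    # Look up one fully specified operation and evaluate its symbolic count...
    f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP, knl.name)
            ].eval_with_dict(params)

    # ...or collapse all attributes except the dtype before looking up.
    f32all = op_map.group_by("dtype")[lp.Op(dtype=np.float32)
            ].eval_with_dict(params)
    assert f32add <= f32all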
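Similarly, a hedged sketch of the gather_access_footprints interface behind the fp.items() fix above (the kernel and the n=200 value are made up for illustration): the function returns a plain dict keyed by (variable_name, "read"/"write") tuples, which is why Python 3's dict.items() is the drop-in replacement for six.iteritems.

    import numpy as np
    import loopy as lp
    from loopy.statistics import gather_access_footprints, count

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "y[i] = x[i] + x[i+1]",
            [lp.GlobalArg("x", np.float32, shape="n+1"),
             lp.GlobalArg("y", np.float32, shape="n"), ...],
            lang_version=(2018, 2))

    fp = gather_access_footprints(knl)
    for (vname, direction), footprint in fp.items():
        # count() turns the footprint set into a symbolic count;
        # evaluate it for a concrete parameter value.
        print(vname, direction,
              count(knl.root_kernel, footprint).eval_with_dict({"n": 200}))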
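Finally, the test_tuple hunk replaces the string domain "{ [i]: 0 = i }" with a zero-dimensional isl set. A small sketch of that idiom, assuming only islpy and loopy as above: a domain with no inames makes the instruction run exactly once, with no pinned dummy iname to schedule around.

    import islpy as isl
    import loopy as lp

    # "[] -> {[]: }": no parameters, no set dimensions -- a single point.
    knl = lp.make_kernel(
            [isl.BasicSet("[] -> {[]: }")],
            """
            a, b = make_tuple(1, 2.)
            """,
            lang_version=(2018, 2))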