diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d0481192817877edea3b8deaaaf86b480fab2a11
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,63 @@
+name: CI
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ paths-ignore:
+ - 'doc/*.rst'
+ schedule:
+ - cron: '17 3 * * 0'
+
+jobs:
+ flake8:
+ name: Flake8
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ -
+ uses: actions/setup-python@v1
+ with:
+ python-version: '3.x'
+ - name: "Main Script"
+ run: |
+ curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
+ . ./prepare-and-run-flake8.sh ./loopy ./test
+
+ pylint:
+ name: Pylint
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: "Main Script"
+ run: |
+ sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml
+ CONDA_ENVIRONMENT=.test-conda-env.yml
+ USE_CONDA_BUILD=1
+ curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
+ . ./prepare-and-run-pylint.sh loopy test/test_*.py
+
+ pytest3:
+ name: Conda Pytest Py3
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: "Main Script"
+ run: |
+ CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+ curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+ . ./build-and-test-py-project-within-miniconda.sh
+
+ pytest_twice:
+ name: Pytest twice (for cache behavior) on Py${{ matrix.python-version }}
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: "Main Script"
+ run: |
+ CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+ curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+ . ./build-and-test-py-project-within-miniconda.sh
+ ${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxs $TESTABLES
+
+# vim: sw=4
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c11e507ee79cdc6f1567acbf6c12bbd7ed22f1cc..48bee8638df08ebe8c03a17f84c78851ff36466e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,26 +1,7 @@
-Python 2.7 POCL:
- script:
- - export PY_EXE=python2.7
- - export PYOPENCL_TEST=portable
- - export EXTRA_INSTALL="pybind11 numpy mako"
- - export LOOPY_NO_CACHE=1
- - export NO_DOCTESTS=1
- - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- - ". ./build-and-test-py-project.sh"
- tags:
- - python2.7
- - pocl
- except:
- - tags
- artifacts:
- reports:
- junit: test/pytest.xml
-
-
Python 3 POCL:
script:
- export PY_EXE=python3
- - export PYOPENCL_TEST=portable
+ - export PYOPENCL_TEST=portable:pthread
- export EXTRA_INSTALL="pybind11 numpy mako"
- export LOOPY_NO_CACHE=1
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
@@ -57,7 +38,7 @@ Python 3 Intel:
Python 3 POCL Twice With Cache:
script:
- export PY_EXE=python3
- - export PYOPENCL_TEST=portable
+ - export PYOPENCL_TEST=portable:pthread
- export EXTRA_INSTALL="pybind11 numpy mako"
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
@@ -77,7 +58,7 @@ Python 3 POCL Twice With Cache:
# PyPy POCL:
# script:
# - export PY_EXE=pypy
-# - export PYOPENCL_TEST=portable
+# - export PYOPENCL_TEST=portable:pthread
# - export EXTRA_INSTALL="pybind11 numpy mako"
# - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
# - ". ./build-and-test-py-project.sh"
@@ -90,7 +71,7 @@ Python 3 POCL Twice With Cache:
Python 3 POCL Examples:
script:
- export PY_EXE=python3
- - export PYOPENCL_TEST=portable
+ - export PYOPENCL_TEST=portable:pthread
- export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
- ". ./build-py-project-and-run-examples.sh"
tags:
@@ -114,20 +95,6 @@ Pylint:
except:
- tags
-CentOS binary:
- script:
- - (cd build-helpers; ./make-linux-build-docker.sh --nodate)
- - (cd ./build-helpers; ./loopy-centos6 ../examples/fortran/sparse.floopy)
- artifacts:
- expire_in: 4 weeks
- paths:
- - build-helpers/loopy-centos6
- tags:
- - docker
- only:
- - master
- retry: 2
-
Documentation:
script:
- EXTRA_INSTALL="pybind11 numpy"
diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
index a1fe086b4ac4562aaa8fafd32657aebbd1068e8a..ccbbc933aae2d3c0a28d7d30f178661950c76542 100644
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
@@ -1,12 +1,12 @@
name: test-conda-env
channels:
- conda-forge
-- defaults
+- nodefaults
dependencies:
- python=3
- git
-- conda-forge::numpy
+- numpy
- pocl
- mako
- pyopencl
diff --git a/README.rst b/README.rst
index fe7eb751a7144d9758df91914b643392de421450..3240983638e1f6f96ba7fec410c5c893db19c044 100644
--- a/README.rst
+++ b/README.rst
@@ -4,9 +4,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code
.. image:: https://gitlab.tiker.net/inducer/loopy/badges/master/pipeline.svg
:alt: Gitlab Build Status
:target: https://gitlab.tiker.net/inducer/loopy/commits/master
-.. image:: https://dev.azure.com/ak-spam/inducer/_apis/build/status/inducer.loopy?branchName=master
- :alt: Azure Build Status
- :target: https://dev.azure.com/ak-spam/inducer/_build/latest?definitionId=10&branchName=master
+.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=master&event=push
+ :alt: GitHub Build Status
+ :target: https://github.com/inducer/loopy/actions?query=branch%3Amaster+workflow%3ACI+event%3Apush
.. image:: https://badge.fury.io/py/loo.py.png
:alt: Python Package Index Release Page
:target: https://pypi.org/project/loo.py/
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index 0dfb2455568b275b40e699683071da3a1cd2f483..0000000000000000000000000000000000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,114 +0,0 @@
-jobs:
--
- job: 'Python2'
- pool:
- vmImage: 'ubuntu-latest'
-
- steps:
- -
- script: |
- set -e
- sed 's/python=3/python=2.7/' .test-conda-env-py3.yml > .test-conda-env-py2.yml
- cat .test-conda-env-py2.yml
- CONDA_ENVIRONMENT=.test-conda-env-py2.yml
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
- . ./build-and-test-py-project-within-miniconda.sh
-
- displayName: 'Pytest Conda'
- -
- task: PublishTestResults@2
- inputs:
- testResultsFormat: 'JUnit'
- testResultsFiles: 'test/pytest.xml'
-
--
- job: 'Python3'
- pool:
- vmImage: 'ubuntu-latest'
-
- steps:
- -
- script: |
- set -e
- CONDA_ENVIRONMENT=.test-conda-env-py3.yml
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
- . ./build-and-test-py-project-within-miniconda.sh
-
- displayName: 'Pytest Conda'
-
- -
- task: PublishTestResults@2
- inputs:
- testResultsFormat: 'JUnit'
- testResultsFiles: 'test/pytest.xml'
-
--
- job: 'Python3Twice'
- displayName: "Python3 - run tests twice to test cache behavior"
- pool:
- vmImage: 'ubuntu-latest'
-
- steps:
- -
- script: |
- set -e
- CONDA_ENVIRONMENT=.test-conda-env-py3.yml
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
- . ./build-and-test-py-project-within-miniconda.sh
- ${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxs $TESTABLES
-
- displayName: 'Pytest Conda'
-
- -
- task: PublishTestResults@2
- inputs:
- testResultsFormat: 'JUnit'
- testResultsFiles: 'test/pytest.xml'
-
--
- job: 'Flake8'
- pool:
- vmImage: 'ubuntu-latest'
- strategy:
- matrix:
- Python37:
- python.version: '3.7'
-
- steps:
- -
- task: UsePythonVersion@0
- inputs:
- versionSpec: '$(python.version)'
-
- -
- script: |
- set -e
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
- . ./prepare-and-run-flake8.sh loopy test
-
- displayName: 'Flake8'
-
--
- job: 'Pylint'
- pool:
- vmImage: 'ubuntu-latest'
-
- steps:
- -
- script: |
- set -e
- sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml
- CONDA_ENVIRONMENT=.test-conda-env.yml
- USE_CONDA_BUILD=1
- curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
- . ./prepare-and-run-pylint.sh loopy test/test_*.py
-
- displayName: 'Pylint'
-
-schedules:
--
- cron: "0 0 * * 0"
- displayName: Weekly build
- branches:
- include:
- - master
diff --git a/build-helpers/.gitignore b/build-helpers/.gitignore
deleted file mode 100644
index fef83014eecb14936006b90afc65595dd7d30b77..0000000000000000000000000000000000000000
--- a/build-helpers/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-loopy-*-20[0-9][0-9]*
diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec
deleted file mode 100644
index 08c0b6efe0efd3ad419b6565fd396c2f805eeab7..0000000000000000000000000000000000000000
--- a/build-helpers/loopy.spec
+++ /dev/null
@@ -1,70 +0,0 @@
-# -*- mode: python -*-
-
-from os.path import basename, dirname, join
-from glob import glob
-
-single_file = True
-
-# This makes the executable spew debug info.
-debug = False
-
-from os.path import expanduser
-
-import packaging # pip install packaging to add
-
-a = Analysis(['../bin/loopy'],
- pathex=[expanduser('~/src/loopy')],
- hiddenimports=[
- "decorator",
- "appdirs",
- "packaging.markers",
- "packaging.specifiers",
- "packaging.version",
- "packaging.requirements",
- ],
- hookspath=None,
- runtime_hooks=None,
- excludes=["hedge", "meshpy", "pyopencl", "PIL"]
- )
-
-import ply.lex
-import ply.yacc
-
-
-a.datas += [
- (join("py-src", "ply", "lex", basename(fn)), fn, "DATA")
- for fn in glob(join(dirname(ply.lex.__file__), "*.py"))
- ] + [
- (join("py-src", "ply", "yacc", basename(fn)), fn, "DATA")
- for fn in glob(join(dirname(ply.yacc.__file__), "*.py"))
- ]
-
-pyz = PYZ(a.pure)
-
-if single_file:
- exe = EXE(pyz,
- a.scripts,
- a.binaries,
- a.zipfiles,
- a.datas,
- name='loopy',
- debug=debug,
- strip=None,
- upx=True,
- console=True)
-else:
- exe = EXE(pyz,
- a.scripts,
- exclude_binaries=True,
- name='loopy',
- debug=debug,
- strip=None,
- upx=True,
- console=True)
- coll = COLLECT(exe,
- a.binaries,
- a.zipfiles,
- a.datas,
- strip=None,
- upx=True,
- name='loopy')
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh
deleted file mode 100755
index 035634b16072e0188270abd8736dab99ce31dada..0000000000000000000000000000000000000000
--- a/build-helpers/make-linux-build-docker-inner-part-2.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#! /bin/bash
-
-set -e
-set -x
-
-VENV_VERSION="virtualenv-15.2.0"
-rm -Rf "$VENV_VERSION"
-curl -k https://files.pythonhosted.org/packages/b1/72/2d70c5a1de409ceb3a27ff2ec007ecdd5cc52239e7c74990e32af57affe9/$VENV_VERSION.tar.gz | tar xfz -
-
-$VENV_VERSION/virtualenv.py --system-site-packages --no-setuptools .env
-
-source .env/bin/activate
-
-curl -k https://bootstrap.pypa.io/ez_setup.py | python -
-curl -k https://gitlab.tiker.net/inducer/pip/raw/7.0.3/contrib/get-pip.py | python -
-
-pip install packaging
-
-PYTHON_VER=$(python -c 'import sys; print(".".join(str(s) for s in sys.version_info[:2]))')
-pip install git+https://github.com/pyinstaller/pyinstaller.git@413c37bec126c0bd26084813593f65128966b4b7
-
-git clone --recursive git://github.com/inducer/loopy
-cd loopy
-
-grep -v pyopencl requirements.txt > myreq.txt
-
-# needed for pyinstaller package to be usable
-echo packaging >> myreq.txt
-
-pip install -r myreq.txt
-python setup.py install
-
-chown -R user /tmp/build
-
-su user -p -c "cd /tmp/build && source .env/bin/activate && cd loopy && ./build-helpers/run-pyinstaller.sh"
diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh
deleted file mode 100755
index a7f621b1ef21676898d2283d93f8a54f086e5d9d..0000000000000000000000000000000000000000
--- a/build-helpers/make-linux-build-docker-inner.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#! /bin/bash
-
-set -e
-set -x
-
-mkdir /tmp/build
-cd /tmp/build
-
-useradd -d /home/user -m -s /bin/bash user
-
-yum install -y centos-release-scl
-yum install -y git python27 python27-python-devel python27-numpy tar gcc gcc-c++ mercurial libffi-devel
-
-scl enable python27 /mnt/make-linux-build-docker-inner-part-2.sh
-
diff --git a/build-helpers/make-linux-build-docker.sh b/build-helpers/make-linux-build-docker.sh
deleted file mode 100755
index fb0cfb587d654698800bfdc827259691bc056fb7..0000000000000000000000000000000000000000
--- a/build-helpers/make-linux-build-docker.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#! /bin/bash
-
-# should be run in this directory (build-helpers)
-
-if test "$1" = "--nodate"; then
- TGT_NAME=loopy-centos6
-else
- TGT_NAME=loopy-centos6-$(date +"%Y-%m-%d")
-fi
-
-echo "Generating $TGT_NAME..."
-
-set -e
-set -x
-
-docker pull centos:6
-
-CNT=$(docker create -t -v $(pwd):/mnt centos:6 /mnt/make-linux-build-docker-inner.sh)
-echo "working in container $CNT"
-
-docker start -i $CNT
-
-docker cp $CNT:/tmp/build/loopy/dist/loopy $(pwd) || true
-
-mv loopy $TGT_NAME
-
-docker rm $CNT
-
diff --git a/build-helpers/run-pyinstaller.sh b/build-helpers/run-pyinstaller.sh
deleted file mode 100755
index 50f9d85dccc503be2a2ccfb6c0e3d6aa28216981..0000000000000000000000000000000000000000
--- a/build-helpers/run-pyinstaller.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#! /bin/bash
-
-# run this from the loopy root directory
-
-rm -Rf dist build
-
-pyinstaller \
- --workpath=build/pyinstaller \
- build-helpers/loopy.spec
diff --git a/build-helpers/upload.sh b/build-helpers/upload.sh
deleted file mode 100755
index 57b8a873b9395954d76a8fd16f8ca9a261e8baa3..0000000000000000000000000000000000000000
--- a/build-helpers/upload.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#! /bin/bash
-
-set -e
-
-scp "$1" tiker.net:public_html/pub/loopy-binaries/
diff --git a/doc/index.rst b/doc/index.rst
index b77bbb16f413defe5010c75d28464051553b4486..8f114eb72cdc530dd4109257c4981118c5046f06 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -20,29 +20,6 @@ When you run this script, the following kernel is generated, compiled, and execu
.. _static-binary:
-Want to try out loopy?
-----------------------
-
-There's no need to go through :ref:`installation` if you'd just like to get a
-feel for what loopy is. Instead, you may
-`download a self-contained Linux binary `_.
-This is purposefully built on an ancient Linux distribution, so it should work
-on most versions of Linux that are currently out there.
-
-Once you have the binary, do the following::
-
- chmod +x ./loopy-centos6
- ./loopy-centos6 --target=opencl hello-loopy.loopy
- ./loopy-centos6 --target=cuda hello-loopy.loopy
- ./loopy-centos6 --target=ispc hello-loopy.loopy
-
-Grab the example here: :download:`examples/python/hello-loopy.loopy <../examples/python/hello-loopy.loopy>`.
-
-You may also donwload the most recent version by going to the `list of builds
-`_, clicking on the newest one
-of type "CentOS binary", clicking on "Browse" under "Build Artifacts", then
-navigating to "build-helpers", and downloading the binary from there.
-
Places on the web related to Loopy
----------------------------------
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 409cbef576d654be973dd6d1424ac40d3ea60982..af35221ad5dcd736190e40a454656a7fa069a787 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -151,6 +151,42 @@ Tag Meaning
.. }}}
+Identifiers
+-----------
+
+Reserved Identifiers
+^^^^^^^^^^^^^^^^^^^^
+
+The identifier prefix ``_lp_`` is reserved for internal usage; when creating
+*inames*, *argument names*, *temporary variable names*, *substitution rule
+names*, *instruction IDs*, and other identifiers, users should *not* use names
+beginning with ``_lp_``. This prefix is used for identifiers created
+internally when operating on Loopy's kernel IR. For Loopy developers, further
+information on name prefixes used within submodules is below.
+
+Identifier Registry
+^^^^^^^^^^^^^^^^^^^
+
+Functionality in :mod:`loopy` *must* use identifiers beginning with ``_lp_`` for
+all internally-created identifiers. Additionally, each name beginning with
+``_lp_`` must start with one of the reserved prefixes below. New prefixes may
+be registered by adding them to the table below. New prefixes may not themselves
+be the prefix of an existing prefix.
+
+**Reserved Identifier Prefixes**
+
+======================= ==================================
+Reserved Prefix Usage (module or purpose)
+======================= ==================================
+``_lp_linchk_`` :mod:`loopy.linearization.checker`
+======================= ==================================
+
+.. note::
+
+ Existing Loopy code may not yet fully satisfy these naming requirements.
+ Name changes are in progress, and prefixes will be added to this registry
+ as they are created.
+
.. _instructions:
Instructions
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index 740c5cb5848dbb7c6f657011bfc23fa88ca173ec..57d33b53999e06cbb07cc8363bbc46c091033cb3 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -118,7 +118,7 @@ Finishing up
.. autofunction:: generate_loop_schedules
-.. autofunction:: get_one_scheduled_kernel
+.. autofunction:: get_one_linearized_kernel
.. autofunction:: save_and_reload_temporaries
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 753b09b5da42835b88a000bc0400fa18a254d80f..1b017f701f8161e93c4fdc1c14644dfe4b4fa74c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1204,16 +1204,16 @@ Here is what happens when we try to generate code for the kernel:
This happens due to the kernel splitting done by :mod:`loopy`. The splitting
happens when the instruction schedule is generated. To see the schedule, we
-should call :func:`loopy.get_one_scheduled_kernel`:
+should call :func:`loopy.get_one_linearized_kernel`:
- >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+ >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl))
>>> print(knl)
---------------------------------------------------------------------------
KERNEL: rotate_v2
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
- SCHEDULE:
+ LINEARIZATION:
0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[])
1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: RETURN FROM KERNEL rotate_v2
@@ -1233,12 +1233,12 @@ goes for local temporaries).
:func:`loopy.save_and_reload_temporaries` for the purpose of handling the
task of saving and restoring temporary values across global barriers. This
function adds instructions to the kernel without scheduling them. That means
-that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to
+that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to
put those instructions into the schedule.
- >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+ >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl))
>>> knl = lp.save_and_reload_temporaries(knl)
- >>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions
+ >>> knl = lp.get_one_linearized_kernel(knl) # Schedule added instructions
>>> print(knl)
---------------------------------------------------------------------------
KERNEL: rotate_v2
@@ -1251,7 +1251,7 @@ put those instructions into the schedule.
---------------------------------------------------------------------------
...
---------------------------------------------------------------------------
- SCHEDULE:
+ LINEARIZATION:
0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[])
1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save}
diff --git a/examples/fortran/matmul-driver.py b/examples/fortran/matmul-driver.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db569480d521e58210c030e742386cd12dc8d37
--- /dev/null
+++ b/examples/fortran/matmul-driver.py
@@ -0,0 +1,35 @@
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.array
+import pyopencl.clrandom
+import loopy as lp
+
+
+def main():
+ fn = "matmul.floopy"
+ with open(fn, "r") as inf:
+ source = inf.read()
+
+ dgemm, = lp.parse_transformed_fortran(source, filename=fn)
+
+ ctx = cl.create_some_context()
+ queue = cl.CommandQueue(ctx)
+
+ n = 2048
+ a = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
+ b = cl.array.empty(queue, (n, n), dtype=np.float64, order="F")
+ c = cl.array.zeros(queue, (n, n), dtype=np.float64, order="F")
+ cl.clrandom.fill_rand(a)
+ cl.clrandom.fill_rand(b)
+
+ dgemm = lp.set_options(dgemm, write_code=True)
+
+ dgemm(queue, a=a, b=b, alpha=1, c=c)
+
+ c_ref = (a.get() @ b.get())
+ assert la.norm(c_ref - c.get())/la.norm(c_ref) < 1e-10
+
+
+if __name__ == "__main__":
+ main()
diff --git a/loopy/__init__.py b/loopy/__init__.py
index b60de6e2dcd35c1c167bf5e303401f2c6242ebec..807ce88341a8845a154d853077aea649c0938064 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -123,12 +123,12 @@ from loopy.transform.add_barrier import add_barrier
from loopy.type_inference import infer_unknown_types
from loopy.preprocess import preprocess_kernel, realize_reduction
-from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
-from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping,
- Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly,
- get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map,
- get_synchronization_poly, get_synchronization_map,
- gather_access_footprints, gather_access_footprint_bytes)
+from loopy.schedule import (
+ generate_loop_schedules, get_one_scheduled_kernel, get_one_linearized_kernel)
+from loopy.statistics import (ToCountMap, CountGranularity,
+ stringify_stats_mapping, Op, MemAccess, get_op_map, get_mem_access_map,
+ get_synchronization_map, gather_access_footprints,
+ gather_access_footprint_bytes)
from loopy.codegen import (
PreambleInfo,
generate_code, generate_code_v2, generate_body)
@@ -248,16 +248,16 @@ __all__ = [
"infer_unknown_types",
"preprocess_kernel", "realize_reduction",
- "generate_loop_schedules", "get_one_scheduled_kernel",
+ "generate_loop_schedules",
+ "get_one_scheduled_kernel", "get_one_linearized_kernel",
"GeneratedProgram", "CodeGenerationResult",
"PreambleInfo",
"generate_code", "generate_code_v2", "generate_body",
"ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op",
- "MemAccess", "get_op_poly", "get_op_map", "get_lmem_access_poly",
- "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map",
- "get_synchronization_poly", "get_synchronization_map",
- "gather_access_footprints", "gather_access_footprint_bytes",
+ "MemAccess", "get_op_map", "get_mem_access_map",
+ "get_synchronization_map", "gather_access_footprints",
+ "gather_access_footprint_bytes",
"CompiledKernel",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 6837b99a026debf32b12aceef00ed3863c620639..ca70c8489238ee6f1fd95f52b02dbe451ddf13ef 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -534,7 +534,7 @@ def auto_test_vs_ref(
from loopy.target.pyopencl import PyOpenCLTarget
if test_knl.state not in [
KernelState.PREPROCESSED,
- KernelState.SCHEDULED]:
+ KernelState.LINEARIZED]:
if isinstance(test_knl.target, PyOpenCLTarget):
test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))
diff --git a/loopy/check.py b/loopy/check.py
index cc87ad9872668bf5323aefd79944e3bbd71b1153..da49c1d116df1a9fbf92e8ef41822b6741405604 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -184,6 +184,19 @@ def check_for_inactive_iname_access(kernel):
", ".join(expression_inames - kernel.insn_inames(insn))))
+def check_for_unused_inames(kernel):
+ # Warn if kernel has unused inames
+ from loopy.transform.iname import get_used_inames
+ unused_inames = kernel.all_inames() - get_used_inames(kernel)
+ if unused_inames:
+ warn_with_kernel(
+ kernel, "unused_inames",
+ "Found unused inames in kernel: %s "
+ "Unused inames during linearization will be prohibited in "
+ "Loopy version 2021.X."
+ % unused_inames)
+
+
def _is_racing_iname_tag(tv, tag):
from loopy.kernel.data import (AddressSpace,
LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto)
@@ -220,12 +233,12 @@ def check_for_write_races(kernel):
assignee_inames = assignee_indices & kernel.all_inames()
if not assignee_inames <= kernel.insn_inames(insn):
raise LoopyError(
- "assignee of instructiosn '%s' references "
+ "assignee of instructions '%s' references "
"iname that the instruction does not depend on"
% insn.id)
if assignee_name in kernel.arg_dict:
- # Any parallel tags that are not depended upon by the assignee
+ # Any concurrent tags that are not depended upon by the assignee
# will cause write races.
raceable_parallel_insn_inames = set(
@@ -658,6 +671,7 @@ def pre_schedule_checks(kernel):
check_loop_priority_inames_known(kernel)
check_multiple_tags_allowed(kernel)
check_for_inactive_iname_access(kernel)
+ check_for_unused_inames(kernel)
check_for_write_races(kernel)
check_for_data_dependent_parallel_bounds(kernel)
check_bounds(kernel)
diff --git a/loopy/cli.py b/loopy/cli.py
index a92922b1845d76dd7a700a93c05de3eecf8c28dd..cdc24800be0edf3935aacccdd4dc4d9905cf5965 100644
--- a/loopy/cli.py
+++ b/loopy/cli.py
@@ -39,13 +39,13 @@ def defines_to_python_code(defines_str):
import re
define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$")
result = []
- for l in defines_str.split("\n"):
- if not l.strip():
+ for line in defines_str.split("\n"):
+ if not line.strip():
continue
- match = define_re.match(l)
+ match = define_re.match(line)
if match is None:
- raise RuntimeError("#define not understood: '%s'" % l)
+ raise RuntimeError("#define not understood: '%s'" % line)
result.append(
"%s = %s" % (match.group(1), to_python_literal(match.group(2))))
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..b4811dc9966921fa612aabef9a726d6b53fd4052 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -388,7 +388,7 @@ def generate_code_v2(kernel):
from loopy.schedule import get_one_scheduled_kernel
kernel = get_one_scheduled_kernel(kernel)
- if kernel.state != KernelState.SCHEDULED:
+ if kernel.state != KernelState.LINEARIZED:
raise LoopyError("cannot generate code for a kernel that has not been "
"scheduled")
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index c946e09a086e574a2593d60f652a81773d95a1fe..b736191ec1dadb842e12453fbec3b68e831338f6 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -59,6 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
from loopy.schedule import (
find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
+ VectorizeTag,
IlpBaseTag)
result = find_active_inames_at(kernel, sched_index)
@@ -67,7 +68,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
# Find our containing subkernel. Grab inames for all insns from there.
within_subkernel = False
- for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]):
+ for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]):
from loopy.schedule import CallKernel, ReturnFromKernel
if isinstance(sched_item, CallKernel):
within_subkernel = True
@@ -92,11 +93,12 @@ def get_usable_inames_for_conditional(kernel, sched_index):
#
# - local indices may not be used in conditionals that cross barriers.
#
- # - ILP indices are not available in loop bounds, they only get defined
- # at the innermost level of nesting.
+ # - ILP indices and vector lane indices are not available in loop
+ # bounds, they only get defined at the innermost level of nesting.
if (
kernel.iname_tags_of_type(iname, ConcurrentTag)
+ and not kernel.iname_tags_of_type(iname, VectorizeTag)
and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
and crosses_barrier)
and not kernel.iname_tags_of_type(iname, IlpBaseTag)
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index e1520a82ed69fa2aed729d9b1d849a78d658c4e1..7319b16ac2fe9f39872558a3878161b89cab15d9 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -24,7 +24,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
-import six
from loopy.codegen.result import merge_codegen_results, wrap_in_if
import islpy as isl
from loopy.schedule import (
@@ -33,30 +32,6 @@ from loopy.schedule import (
from loopy.diagnostic import LoopyError
-def get_admissible_conditional_inames_for(codegen_state, sched_index):
- """This function disallows conditionals on local-idx tagged
- inames if there is a barrier nested somewhere within.
- """
-
- kernel = codegen_state.kernel
-
- from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag,
- filter_iname_tags_by_type)
-
- from loopy.schedule import find_active_inames_at, has_barrier_within
- result = find_active_inames_at(kernel, sched_index)
-
- has_barrier = has_barrier_within(kernel, sched_index)
-
- for iname, tags in six.iteritems(kernel.iname_to_tags):
- if (filter_iname_tags_by_type(tags, HardwareConcurrentTag)
- and codegen_state.is_generating_device_code):
- if not has_barrier or not filter_iname_tags_by_type(tags, LocalIndexTag):
- result.add(iname)
-
- return frozenset(result)
-
-
def synthesize_idis_for_extra_args(kernel, schedule_index):
"""
:returns: A list of :class:`loopy.codegen.ImplementedDataInfo`
@@ -222,14 +197,14 @@ def get_required_predicates(kernel, sched_index):
return result
-def group_by(l, key, merge):
- if not l:
- return l
+def group_by(entry, key, merge):
+ if not entry:
+ return entry
result = []
- previous = l[0]
+ previous = entry[0]
- for item in l[1:]:
+ for item in entry[1:]:
if key(previous) == key(item):
previous = merge(previous, item)
@@ -302,11 +277,13 @@ def build_loop_nest(codegen_state, schedule_index):
"""
from loopy.schedule import find_used_inames_within
+ from loopy.codegen.bounds import get_usable_inames_for_conditional
+
sched_index_info_entries = [
ScheduleIndexInfo(
schedule_indices=[i],
admissible_cond_inames=(
- get_admissible_conditional_inames_for(codegen_state, i)),
+ get_usable_inames_for_conditional(kernel, i)),
required_predicates=get_required_predicates(kernel, i),
used_inames_within=find_used_inames_within(kernel, i)
)
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 5e0747246160ddc2934c3d545c03a2a9b4090d5d..c0ca875c0e9b661becb1bb0ca6e81139a8a93e2d 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -274,7 +274,7 @@ def generate_c_instruction_code(codegen_state, insn):
if body:
body.append(Line())
- body.extend(Line(l) for l in insn.code.split("\n"))
+ body.extend(Line(line) for line in insn.code.split("\n"))
return Block(body)
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 128e4fbc85a2a03e25da3f88b200e67eb41756d3..b3a87798840bb1624d350c79830f29142e54ab6c 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
kernel = codegen_state.kernel
from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
- LocalIndexTag, GroupIndexTag)
+ LocalIndexTag, GroupIndexTag, VectorizeTag)
from loopy.schedule import get_insn_ids_for_block_at
insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
@@ -242,7 +242,8 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
all_inames_by_insns |= kernel.insn_inames(insn_id)
hw_inames_left = [iname for iname in all_inames_by_insns
- if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]
+ if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
+ and not kernel.iname_tags_of_type(iname, VectorizeTag)]
if not hw_inames_left:
return next_func(codegen_state)
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index 05b0a92050a51be1cd980648325921fbf13768d8..40202d4da3319c0ef24b0317f01cd4d31f88d484 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -86,17 +86,17 @@ def _extract_loopy_lines(source):
loopy_lines = []
in_loopy_code = False
- for l in lines:
- comment_match = comment_re.match(l)
+ for line in lines:
+ comment_match = comment_re.match(line)
if comment_match is None:
if in_loopy_code:
raise LoopyError("non-comment source line in loopy block")
- remaining_lines.append(l)
+ remaining_lines.append(line)
# Preserves line numbers in loopy code, for debuggability
- loopy_lines.append("# "+l)
+ loopy_lines.append("# "+line)
continue
cmt = comment_match.group(1)
@@ -108,7 +108,7 @@ def _extract_loopy_lines(source):
in_loopy_code = True
# Preserves line numbers in loopy code, for debuggability
- loopy_lines.append("# "+l)
+ loopy_lines.append("# "+line)
elif cmt_stripped == "$loopy end":
if not in_loopy_code:
@@ -116,16 +116,16 @@ def _extract_loopy_lines(source):
in_loopy_code = False
# Preserves line numbers in loopy code, for debuggability
- loopy_lines.append("# "+l)
+ loopy_lines.append("# "+line)
elif in_loopy_code:
loopy_lines.append(cmt)
else:
- remaining_lines.append(l)
+ remaining_lines.append(line)
# Preserves line numbers in loopy code, for debuggability
- loopy_lines.append("# "+l)
+ loopy_lines.append("# "+line)
return "\n".join(remaining_lines), "\n".join(loopy_lines)
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index d7a1b2498af583bc9ff97ba743ccc5ed8bd25d3a..91a5fdc88f02a99c6064f6b9944b08de662a27a8 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -339,11 +339,11 @@ class F2LoopyTranslator(FTreeWalkerBase):
return []
- map_Logical = map_type_decl
- map_Integer = map_type_decl
- map_Real = map_type_decl
- map_Complex = map_type_decl
- map_DoublePrecision = map_type_decl
+ map_Logical = map_type_decl # noqa: N815
+ map_Integer = map_type_decl # noqa: N815
+ map_Real = map_type_decl # noqa: N815
+ map_Complex = map_type_decl # noqa: N815
+ map_DoublePrecision = map_type_decl # noqa: N815
def map_Dimension(self, node):
scope = self.scope_stack[-1]
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 80a7ad03101bc67f39c89c6089aa6533d1886185..2d926aad4faa511aa2919630c9b0e96b7f253ad9 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -35,14 +35,13 @@ import islpy as isl
from islpy import dim_type
import re
-from pytools import UniqueNameGenerator, generate_unique_names
+from pytools import UniqueNameGenerator, generate_unique_names, natsorted
from loopy.library.function import (
default_function_mangler,
single_arg_function_mangler)
from loopy.diagnostic import CannotBranchDomainTree, LoopyError
-from loopy.tools import natsorted
from loopy.diagnostic import StaticValueFindingError
from loopy.kernel.data import filter_iname_tags_by_type
from warnings import warn
@@ -99,10 +98,25 @@ class _UniqueVarNameGenerator(UniqueNameGenerator):
# {{{ loop kernel object
+class _deprecated_KernelState_SCHEDULED(object): # noqa
+ def __init__(self, f):
+ self.f = f
+
+ def __get__(self, obj, klass):
+ warn(
+ "'KernelState.SCHEDULED' is deprecated. "
+ "Use 'KernelState.LINEARIZED'.",
+ DeprecationWarning, stacklevel=2)
+ return self.f()
+
class KernelState: # noqa
INITIAL = 0
PREPROCESSED = 1
- SCHEDULED = 2
+ LINEARIZED = 2
+
+ @_deprecated_KernelState_SCHEDULED
+ def SCHEDULED(): # pylint:disable=no-method-argument
+ return KernelState.LINEARIZED
# {{{ kernel_state, KernelState compataibility
@@ -228,7 +242,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
# {{{ constructor
- def __init__(self, domains, instructions, args=None, schedule=None,
+ def __init__(self, domains, instructions, args=None,
+ schedule=None,
+ linearization=None,
name="loopy_kernel",
preambles=None,
preamble_generators=None,
@@ -333,10 +349,27 @@ class LoopKernel(ImmutableRecordWithoutPickling):
if state not in [
KernelState.INITIAL,
KernelState.PREPROCESSED,
- KernelState.SCHEDULED,
+ KernelState.LINEARIZED,
]:
raise ValueError("invalid value for 'state'")
+ # `linearization` is replacing `schedule`, but we're not changing
+ # this under the hood yet, so for now, store it inside `schedule`
+ # and raise deprecation warning anyway
+ if schedule is not None:
+ if linearization is not None:
+ # these should not both be present
+ raise ValueError(
+ "received both `schedule` and `linearization` args, "
+ "'LoopKernel.schedule' is deprecated. "
+ "Use 'LoopKernel.linearization'.")
+ warn(
+ "'LoopKernel.schedule' is deprecated. "
+ "Use 'LoopKernel.linearization'.",
+ DeprecationWarning, stacklevel=2)
+ elif linearization is not None:
+ schedule = linearization
+
from collections import defaultdict
assert not isinstance(iname_to_tags, defaultdict)
@@ -1345,7 +1378,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
if "schedule" in what and kernel.schedule is not None:
lines.extend(sep)
if show_labels:
- lines.append("SCHEDULE:")
+ lines.append("LINEARIZATION:")
from loopy.schedule import dump_schedule
lines.append(dump_schedule(kernel, kernel.schedule))
@@ -1395,6 +1428,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):
# }}}
+ # {{{ handle linearization variable that doesn't yet exist
+
+ @property
+ def linearization(self):
+ return self.schedule
+
+ # }}}
+
# {{{ direct execution
def __call__(self, *args, **kwargs):
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 9ac38fc87a27da13e98515085edd6f2e35b1fcd7..e6544b34a55af97a1a15e86f7d74855e08e53116 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -186,7 +186,7 @@ class LoopedIlpTag(IlpBaseTag):
# }}}
-class VectorizeTag(UniqueTag):
+class VectorizeTag(UniqueTag, HardwareConcurrentTag):
def __str__(self):
return "vec"
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 8213c9584b54917050c586e1b83b6d66d0473798..61127232a9f494fe2fdc536dd50d8fdf41b8f17c 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -66,7 +66,8 @@ class InstructionBase(ImmutableRecord):
.. attribute:: depends_on_is_final
A :class:`bool` determining whether :attr:`depends_on` constitutes
- the *entire* list of iname dependencies.
+ the *entire* list of iname dependencies. If *not* marked final,
+ various semi-broken heuristics will try to add further dependencies.
Defaults to *False*.
@@ -344,10 +345,13 @@ class InstructionBase(ImmutableRecord):
"""
raise NotImplementedError
- def with_transformed_expressions(self, f, *args):
+ def with_transformed_expressions(self, f, assignee_f=None):
"""Return a new copy of *self* where *f* has been applied to every
expression occurring in *self*. *args* will be passed as extra
arguments (in addition to the expression) to *f*.
+
+ If *assignee_f* is passed, then left-hand sides of assignments are
+ passed to it. If it is not given, it defaults to the same as *f*.
"""
raise NotImplementedError
@@ -959,12 +963,15 @@ class Assignment(MultiAssignmentBase):
def assignee_subscript_deps(self):
return (_get_assignee_subscript_deps(self.assignee),)
- def with_transformed_expressions(self, f, *args):
+ def with_transformed_expressions(self, f, assignee_f=None):
+ if assignee_f is None:
+ assignee_f = f
+
return self.copy(
- assignee=f(self.assignee, *args),
- expression=f(self.expression, *args),
+ assignee=assignee_f(self.assignee),
+ expression=f(self.expression),
predicates=frozenset(
- f(pred, *args) for pred in self.predicates))
+ f(pred) for pred in self.predicates))
# }}}
@@ -1114,12 +1121,15 @@ class CallInstruction(MultiAssignmentBase):
_get_assignee_subscript_deps(a)
for a in self.assignees)
- def with_transformed_expressions(self, f, *args):
+ def with_transformed_expressions(self, f, assignee_f=None):
+ if assignee_f is None:
+ assignee_f = f
+
return self.copy(
- assignees=f(self.assignees, *args),
- expression=f(self.expression, *args),
+ assignees=assignee_f(self.assignees),
+ expression=f(self.expression),
predicates=frozenset(
- f(pred, *args) for pred in self.predicates))
+ f(pred) for pred in self.predicates))
# }}}
@@ -1315,14 +1325,17 @@ class CInstruction(InstructionBase):
_get_assignee_subscript_deps(a)
for a in self.assignees)
- def with_transformed_expressions(self, f, *args):
+ def with_transformed_expressions(self, f, assignee_f=None):
+ if assignee_f is None:
+ assignee_f = f
+
return self.copy(
iname_exprs=[
- (name, f(expr, *args))
+ (name, f(expr))
for name, expr in self.iname_exprs],
- assignees=[f(a, *args) for a in self.assignees],
+ assignees=[assignee_f(a) for a in self.assignees],
predicates=frozenset(
- f(pred, *args) for pred in self.predicates))
+ f(pred) for pred in self.predicates))
# }}}
@@ -1357,7 +1370,7 @@ class _DataObliviousInstruction(InstructionBase):
def assignee_subscript_deps(self):
return frozenset()
- def with_transformed_expressions(self, f, *args):
+ def with_transformed_expressions(self, f, assignee_f=None):
return self.copy(
predicates=frozenset(
f(pred) for pred in self.predicates))
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 157099df5a2133baa109f24e8216d63577b5dcb4..e33d260fba4f3f4122f35e033ecc573b41999d5d 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -34,8 +34,7 @@ import numpy as np
import islpy as isl
from islpy import dim_type
from loopy.diagnostic import LoopyError, warn_with_kernel
-from pytools import memoize_on_first_arg
-from loopy.tools import natsorted
+from pytools import memoize_on_first_arg, natsorted
import logging
logger = logging.getLogger(__name__)
@@ -1381,7 +1380,7 @@ def draw_dependencies_as_unicode_arrows(
.replace(style.RESET_ALL, ""))
return len(s)
- def truncate_without_color_escapes(s, l):
+ def truncate_without_color_escapes(s, length):
# FIXME: This is a bit dumb--it removes color escapes when truncation
# is needed.
@@ -1389,7 +1388,7 @@ def draw_dependencies_as_unicode_arrows(
.replace(fore.RED, "")
.replace(style.RESET_ALL, ""))
- return s[:l] + u"…"
+ return s[:length] + u"…"
def conform_to_uniform_length(s):
len_s = len_without_color_escapes(s)
@@ -1428,6 +1427,8 @@ def stringify_instruction_list(kernel):
def insert_insn_into_order(insn):
if insn.id in printed_insn_ids:
+ # Note: dependency cycles are deliberately ignored so that printing
+ # succeeds.
return
printed_insn_ids.add(insn.id)
@@ -1511,7 +1512,7 @@ def stringify_instruction_list(kernel):
", ".join("%s=%s" % (name, expr)
for name, expr in insn.iname_exprs))
- trailing = [l for l in insn.code.split("\n")]
+ trailing = insn.code.split("\n")
elif isinstance(insn, lp.BarrierInstruction):
lhs = ""
rhs = "... %sbarrier" % insn.synchronization_kind[0]
@@ -1583,6 +1584,13 @@ def stringify_instruction_list(kernel):
# {{{ global barrier order finding
+def _is_global_barrier(kernel, insn_id):
+ insn = kernel.id_to_insn[insn_id]
+ from loopy.kernel.instruction import BarrierInstruction
+ return isinstance(insn, BarrierInstruction) and \
+ insn.synchronization_kind == "global"
+
+
@memoize_on_first_arg
def get_global_barrier_order(kernel):
"""Return a :class:`tuple` of the listing the ids of global barrier instructions
@@ -1590,49 +1598,27 @@ def get_global_barrier_order(kernel):
See also :class:`loopy.instruction.BarrierInstruction`.
"""
- barriers = []
- visiting = set()
- visited = set()
-
- unvisited = set(insn.id for insn in kernel.instructions)
-
- def is_barrier(my_insn_id):
- insn = kernel.id_to_insn[my_insn_id]
- from loopy.kernel.instruction import BarrierInstruction
- return isinstance(insn, BarrierInstruction) and \
- insn.synchronization_kind == "global"
-
- while unvisited:
- stack = [unvisited.pop()]
-
- while stack:
- top = stack[-1]
-
- if top in visiting:
- visiting.remove(top)
- if is_barrier(top):
- barriers.append(top)
+ dep_graph = {insn.id: set() for insn in kernel.instructions}
+ for insn in kernel.instructions:
+ for dep in insn.depends_on:
+ dep_graph[dep].add(insn.id)
- if top in visited:
- stack.pop()
- continue
+ from pytools.graph import compute_topological_order
+ order = compute_topological_order(dep_graph)
- visited.add(top)
- visiting.add(top)
+ barriers = [
+ insn_id for insn_id in order
+ if _is_global_barrier(kernel, insn_id)]
- for child in kernel.id_to_insn[top].depends_on:
- # Check for no cycles.
- assert child not in visiting
- stack.append(child)
+ del order
# Ensure this is the only possible order.
#
# We do this by looking at the barriers in order.
# We check for each adjacent pair (a,b) in the order if a < b,
# i.e. if a is reachable by a chain of dependencies from b.
-
- visiting.clear()
- visited.clear()
+ visited = set()
+ visiting = set()
for prev_barrier, barrier in zip(barriers, barriers[1:]):
# Check if prev_barrier is reachable from barrier.
@@ -1690,12 +1676,6 @@ def find_most_recent_global_barrier(kernel, insn_id):
if len(insn.depends_on) == 0:
return None
- def is_barrier(my_insn_id):
- insn = kernel.id_to_insn[my_insn_id]
- from loopy.kernel.instruction import BarrierInstruction
- return isinstance(insn, BarrierInstruction) and \
- insn.synchronization_kind == "global"
-
global_barrier_to_ordinal = dict(
(b, i) for i, b in enumerate(global_barrier_order))
@@ -1705,7 +1685,7 @@ def find_most_recent_global_barrier(kernel, insn_id):
else -1)
direct_barrier_dependencies = set(
- dep for dep in insn.depends_on if is_barrier(dep))
+ dep for dep in insn.depends_on if _is_global_barrier(kernel, dep))
if len(direct_barrier_dependencies) > 0:
return max(direct_barrier_dependencies, key=get_barrier_ordinal)
@@ -1727,8 +1707,8 @@ def get_subkernels(kernel):
See also :class:`loopy.schedule.CallKernel`.
"""
from loopy.kernel import KernelState
- if kernel.state != KernelState.SCHEDULED:
- raise LoopyError("Kernel must be scheduled")
+ if kernel.state != KernelState.LINEARIZED:
+ raise LoopyError("Kernel must be linearized")
from loopy.schedule import CallKernel
@@ -1744,7 +1724,7 @@ def get_subkernel_to_insn_id_map(kernel):
kernel must be scheduled.
"""
from loopy.kernel import KernelState
- if kernel.state != KernelState.SCHEDULED:
+ if kernel.state != KernelState.LINEARIZED:
raise LoopyError("Kernel must be scheduled")
from loopy.schedule import (
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index c0eb91ea60317ef8cad1c594571d46bba2d1a671..de81815a82655136941b57b1f78486aed39237da 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION
from loopy.kernel.data import make_assignment, filter_iname_tags_by_type
# for the benefit of loopy.statistics, for now
from loopy.type_inference import infer_unknown_types
+from loopy.transform.iname import remove_any_newly_unused_inames
import logging
logger = logging.getLogger(__name__)
@@ -289,7 +290,7 @@ def _classify_reduction_inames(kernel, inames):
nonlocal_par = []
from loopy.kernel.data import (
- LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
+ LocalIndexTagBase, UnrolledIlpTag, UnrollTag,
ConcurrentTag, filter_iname_tags_by_type)
for iname in inames:
@@ -303,7 +304,7 @@ def _classify_reduction_inames(kernel, inames):
elif filter_iname_tags_by_type(iname_tags, LocalIndexTagBase):
local_par.append(iname)
- elif filter_iname_tags_by_type(iname_tags, (ConcurrentTag, VectorizeTag)):
+ elif filter_iname_tags_by_type(iname_tags, ConcurrentTag):
nonlocal_par.append(iname)
else:
@@ -882,6 +883,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
# }}}
+@remove_any_newly_unused_inames
def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
automagic_scans_ok=False, force_scan=False,
force_outer_iname_for_scan=None):
@@ -1370,7 +1372,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
track_iname = var_name_gen(
"{sweep_iname}__seq_scan"
- .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+ .format(sweep_iname=sweep_iname))
get_or_add_sweep_tracking_iname_and_domain(
scan_iname, sweep_iname, sweep_min_value, scan_min_value,
@@ -1480,7 +1482,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
track_iname = var_name_gen(
"{sweep_iname}__pre_scan"
- .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+ .format(sweep_iname=sweep_iname))
get_or_add_sweep_tracking_iname_and_domain(
scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
@@ -1924,8 +1926,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
kernel = lp.tag_inames(kernel, new_iname_tags)
- # TODO: remove unused inames...
-
kernel = (
_hackily_ensure_multi_assignment_return_values_are_scoped_private(
kernel))
@@ -1979,7 +1979,7 @@ def find_idempotence(kernel):
# Find SCCs of dep_graph. These are used for checking if the instruction is
# in a dependency cycle.
- from loopy.tools import compute_sccs
+ from pytools.graph import compute_sccs
sccs = dict((item, scc)
for scc in compute_sccs(dep_graph)
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index fb0d0e2c17005ecf051d7034fd7903ed5262bdfc..032cdc2760597f1fa6f701a8a88252312deac797 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -212,12 +212,12 @@ def find_loop_nest_with_map(kernel):
"""
result = {}
- from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
+ from loopy.kernel.data import ConcurrentTag, IlpBaseTag
all_nonpar_inames = set(
iname for iname in kernel.all_inames()
if not kernel.iname_tags_of_type(iname,
- (ConcurrentTag, IlpBaseTag, VectorizeTag)))
+ (ConcurrentTag, IlpBaseTag)))
iname_to_insns = kernel.iname_to_insns()
@@ -276,7 +276,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
result = {}
- from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
+ from loopy.kernel.data import ConcurrentTag, IlpBaseTag
for insn in kernel.instructions:
for iname in kernel.insn_inames(insn):
if kernel.iname_tags_of_type(iname, ConcurrentTag):
@@ -310,7 +310,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
continue
if kernel.iname_tags_of_type(dep_insn_iname,
- (ConcurrentTag, IlpBaseTag, VectorizeTag)):
+ (ConcurrentTag, IlpBaseTag)):
# Parallel tags don't really nest, so we'll disregard
# them here.
continue
@@ -1841,7 +1841,7 @@ def generate_loop_schedules(kernel, debug_args={}):
def generate_loop_schedules_inner(kernel, debug_args={}):
from loopy.kernel import KernelState
- if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED):
+ if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED):
raise LoopyError("cannot schedule a kernel that has not been "
"preprocessed")
@@ -1852,7 +1852,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
debug = ScheduleDebugger(**debug_args)
- preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else ()
+ preschedule = kernel.schedule if kernel.state == KernelState.LINEARIZED else ()
prescheduled_inames = set(
insn.iname
@@ -1904,7 +1904,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
unscheduled_insn_ids=set(insn.id for insn in kernel.instructions),
scheduled_insn_ids=frozenset(),
- within_subkernel=kernel.state != KernelState.SCHEDULED,
+ within_subkernel=kernel.state != KernelState.LINEARIZED,
may_schedule_global_barriers=True,
preschedule=preschedule,
@@ -1973,11 +1973,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
new_kernel = kernel.copy(
schedule=gen_sched,
- state=KernelState.SCHEDULED)
+ state=KernelState.LINEARIZED)
from loopy.schedule.device_mapping import \
map_schedule_onto_host_or_device
- if kernel.state != KernelState.SCHEDULED:
+ if kernel.state != KernelState.LINEARIZED:
# Device mapper only gets run once.
new_kernel = map_schedule_onto_host_or_device(new_kernel)
@@ -2029,6 +2029,15 @@ def _get_one_scheduled_kernel_inner(kernel):
def get_one_scheduled_kernel(kernel):
+ warn_with_kernel(
+ kernel, "get_one_scheduled_kernel_deprecated",
+ "get_one_scheduled_kernel is deprecated. "
+ "Use get_one_linearized_kernel instead.",
+ DeprecationWarning)
+ return get_one_linearized_kernel(kernel)
+
+
+def get_one_linearized_kernel(kernel):
from loopy import CACHING_ENABLED
sched_cache_key = kernel
diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index 59afb07d2e9b7713dbe86c2c5aef7356decbbcff..d45c1ecbdc7ea091ce7d1a3899e82c14bb6fef2b 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -31,7 +31,7 @@ from loopy.schedule.tools import get_block_boundaries
def map_schedule_onto_host_or_device(kernel):
# FIXME: Should be idempotent.
from loopy.kernel import KernelState
- assert kernel.state == KernelState.SCHEDULED
+ assert kernel.state == KernelState.LINEARIZED
from functools import partial
device_prog_name_gen = partial(
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 10d29daad062744ca3fbe2dc2261be4cd2c4ca99..32fe7741e1298c99e2baf74f3e08e67fc8b2a63e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1863,75 +1863,4 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
# }}}
-
-# {{{ compat goop
-
-def get_lmem_access_poly(knl):
- """Count the number of local memory accesses in a loopy kernel.
-
- get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
- result with the mtype=['local'] option.
-
- """
- warn_with_kernel(knl, "deprecated_get_lmem_access_poly",
- "get_lmem_access_poly is deprecated. Use "
- "get_mem_access_map and filter the result with the "
- "mtype=['local'] option.")
- return get_mem_access_map(knl).filter_by(mtype=['local'])
-
-
-def get_DRAM_access_poly(knl):
- """Count the number of global memory accesses in a loopy kernel.
-
- get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
- result with the mtype=['global'] option.
-
- """
- warn_with_kernel(knl, "deprecated_get_DRAM_access_poly",
- "get_DRAM_access_poly is deprecated. Use "
- "get_mem_access_map and filter the result with the "
- "mtype=['global'] option.")
- return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-def get_gmem_access_poly(knl):
- """Count the number of global memory accesses in a loopy kernel.
-
- get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
- result with the mtype=['global'] option.
-
- """
- warn_with_kernel(knl, "deprecated_get_gmem_access_poly",
- "get_DRAM_access_poly is deprecated. Use "
- "get_mem_access_map and filter the result with the "
- "mtype=['global'] option.")
- return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-def get_synchronization_poly(knl):
- """Count the number of synchronization events each work-item encounters in
- a loopy kernel.
-
- get_synchronization_poly is deprecated. Use get_synchronization_map
- instead.
-
- """
- warn_with_kernel(knl, "deprecated_get_synchronization_poly",
- "get_synchronization_poly is deprecated. Use "
- "get_synchronization_map instead.")
- return get_synchronization_map(knl)
-
-
-def get_op_poly(knl, numpy_types=True):
- """Count the number of operations in a loopy kernel.
-
- get_op_poly is deprecated. Use get_op_map instead.
-
- """
- warn_with_kernel(knl, "deprecated_get_op_poly",
- "get_op_poly is deprecated. Use get_op_map instead.")
- return get_op_map(knl, numpy_types)
-
-# }}}
-
# vim: foldmethod=marker
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index ccac5e199d2b53e202dd735ffd8dfe20a7dc29a2..4156dfcc1673d176ffb609cf280b28c97cc4949f 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -273,8 +273,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase):
if not isinstance(other, type(expr)):
return self.treat_mismatch(expr, other, unis)
if (expr.inames != other.inames
- or type(expr.operation) != type(other.operation) # noqa
- ):
+ or type(expr.operation) != type(other.operation)): # noqa
return []
return self.rec(expr.expr, other.expr, unis)
@@ -971,7 +970,8 @@ class RuleAwareIdentityMapper(IdentityMapper):
# may perform tasks entirely unrelated to subst rules, so
# we must map assignees, too.
self.map_instruction(kernel,
- insn.with_transformed_expressions(self, kernel, insn))
+ insn.with_transformed_expressions(
+ lambda expr: self(expr, kernel, insn)))
for insn in kernel.instructions]
return kernel.copy(instructions=new_insns)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 6e3602eda11d5f65e8a6af2977966e946c72a718..8869ebecf3e08bf7921d4c9118dd1fda263adb32 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -80,6 +80,11 @@ class DTypeRegistryWrapper(object):
def c99_preamble_generator(preamble_info):
if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes):
yield("10_stdint", "#include ")
+ if any(dtype.numpy_dtype == np.dtype("bool")
+ for dtype in preamble_info.seen_dtypes):
+ yield("10_stdbool", "#include ")
+ if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes):
+ yield("10_complex", "#include ")
def _preamble_generator(preamble_info):
@@ -436,7 +441,7 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
arg_dtypes=arg_dtypes)
# binary functions
- if (name in ["fmax", "fmin"]
+ if (name in ["fmax", "fmin", "copysign"]
and len(arg_dtypes) == 2):
dtype = np.find_common_type(
@@ -1079,9 +1084,11 @@ class CTarget(CFamilyTarget):
@memoize_method
def get_dtype_registry(self):
from loopy.target.c.compyte.dtypes import (
- DTypeRegistry, fill_registry_with_c99_stdint_types)
+ DTypeRegistry, fill_registry_with_c99_stdint_types,
+ fill_registry_with_c99_complex_types)
result = DTypeRegistry()
fill_registry_with_c99_stdint_types(result)
+ fill_registry_with_c99_complex_types(result)
return DTypeRegistryWrapper(result)
diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte
index 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372..7e48e1166a13cfbb7b60f909b071f088034ffda1 160000
--- a/loopy/target/c/compyte
+++ b/loopy/target/c/compyte
@@ -1 +1 @@
-Subproject commit 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372
+Subproject commit 7e48e1166a13cfbb7b60f909b071f088034ffda1
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index c5e8d0a7f7a9f70b3afe46e9d04a3bf861066329..845e0a4326dbb24e509f98c808a9ce3ac3cb52be 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -1,3 +1,4 @@
+# coding: utf-8
"""OpenCL target integrated with PyOpenCL."""
from __future__ import division, absolute_import
@@ -285,6 +286,9 @@ class PyOpenCLTarget(OpenCLTarget):
warnings) and support for complex numbers.
"""
+ # FIXME make prefixes conform to naming rules
+ # (see Reference: Loopy’s Model of a Kernel)
+
host_program_name_prefix = "_lpy_host_"
host_program_name_suffix = ""
@@ -299,7 +303,26 @@ class PyOpenCLTarget(OpenCLTarget):
self.device = device
self.pyopencl_module_name = pyopencl_module_name
- comparison_fields = ["device"]
+ # NB: Not including 'device', as that is handled specially here.
+ hash_fields = OpenCLTarget.hash_fields + (
+ "pyopencl_module_name",)
+ comparison_fields = OpenCLTarget.comparison_fields + (
+ "pyopencl_module_name",)
+
+ def __eq__(self, other):
+ if not super(PyOpenCLTarget, self).__eq__(other):
+ return False
+
+ if (self.device is None) != (other.device is None):
+ return False
+
+ if self.device is not None:
+ assert other.device is not None
+ return (self.device.persistent_unique_id
+ == other.device.persistent_unique_id)
+ else:
+ assert other.device is None
+ return True
def update_persistent_hash(self, key_hash, key_builder):
super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder)
diff --git a/loopy/tools.py b/loopy/tools.py
index 33b6616f32fb6c5fa6e4517e137ef426a806fb3f..a1cd5e108a45ba60c71b3bb7a51f779b84172065 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -210,11 +210,11 @@ def remove_common_indentation(code, require_leading_newline=True,
test_line = None
if ignore_lines_starting_with:
- for l in lines:
- strip_l = l.lstrip()
+ for line in lines:
+ strip_l = line.lstrip()
if (strip_l
and not strip_l.startswith(ignore_lines_starting_with)):
- test_line = l
+ test_line = line
break
else:
@@ -355,65 +355,6 @@ def empty_aligned(shape, dtype, order='C', n=64):
# }}}
-# {{{ compute SCCs with Tarjan's algorithm
-
-def compute_sccs(graph):
- to_search = set(graph.keys())
- visit_order = {}
- scc_root = {}
- sccs = []
-
- while to_search:
- top = next(iter(to_search))
- call_stack = [(top, iter(graph[top]), None)]
- visit_stack = []
- visiting = set()
-
- scc = []
-
- while call_stack:
- top, children, last_popped_child = call_stack.pop()
-
- if top not in visiting:
- # Unvisited: mark as visited, initialize SCC root.
- count = len(visit_order)
- visit_stack.append(top)
- visit_order[top] = count
- scc_root[top] = count
- visiting.add(top)
- to_search.discard(top)
-
- # Returned from a recursion, update SCC.
- if last_popped_child is not None:
- scc_root[top] = min(
- scc_root[top],
- scc_root[last_popped_child])
-
- for child in children:
- if child not in visit_order:
- # Recurse.
- call_stack.append((top, children, child))
- call_stack.append((child, iter(graph[child]), None))
- break
- if child in visiting:
- scc_root[top] = min(
- scc_root[top],
- visit_order[child])
- else:
- if scc_root[top] == visit_order[top]:
- scc = []
- while visit_stack[-1] != top:
- scc.append(visit_stack.pop())
- scc.append(visit_stack.pop())
- for item in scc:
- visiting.remove(item)
- sccs.append(scc)
-
- return sccs
-
-# }}}
-
-
# {{{ pickled container value
class _PickledObject(object):
@@ -673,20 +614,4 @@ def is_interned(s):
def intern_frozenset_of_ids(fs):
return frozenset(intern(s) for s in fs)
-
-def natorder(key):
- # Return natural ordering for strings, as opposed to dictionary order.
- # E.g. will result in
- # 'abc1' < 'abc9' < 'abc10'
- # rather than
- # 'abc1' < 'abc10' < 'abc9'
- # Based on
- # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
- import re
- return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)]
-
-
-def natsorted(seq, key=lambda x: x):
- return sorted(seq, key=lambda y: natorder(key(y)))
-
# vim: foldmethod=marker
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index a6a2d7b4fe4ba94caa8cbe112a5cf90719ceb643..1f0161c06868da4a7c71ba1ebf9eab8ef02eeb3d 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -285,15 +285,15 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
if temporary_name is None:
temporary_name = var_name_gen("%s_fetch" % c_name)
- arg = kernel.arg_dict[var_name]
+ var_descr = kernel.get_var_descriptor(var_name)
# {{{ make parameter names and unification template
parameters = []
- for i in range(arg.num_user_axes()):
+ for i in range(var_descr.num_user_axes()):
based_on = "%s_dim_%d" % (c_name, i)
- if arg.dim_names is not None:
- based_on = "%s_dim_%s" % (c_name, arg.dim_names[i])
+ if var_descr.dim_names is not None:
+ based_on = "%s_dim_%s" % (c_name, var_descr.dim_names[i])
if dim_arg_names is not None and i < len(dim_arg_names):
based_on = dim_arg_names[i]
@@ -322,7 +322,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
kernel, subst_use, sweep_inames, inames_to_be_removed = \
_process_footprint_subscripts(
kernel, rule_name, sweep_inames,
- footprint_subscripts, arg)
+ footprint_subscripts, var_descr)
# Our _not_provided is actually a different object from the one in the
# precompute module, but precompute acutally uses that to adjust its
@@ -331,7 +331,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
from loopy.transform.precompute import precompute
new_kernel = precompute(kernel, subst_use, sweep_inames,
precompute_inames=dim_arg_names,
- default_tag=default_tag, dtype=arg.dtype,
+ default_tag=default_tag, dtype=var_descr.dtype,
fetch_bounding_box=fetch_bounding_box,
temporary_name=temporary_name,
temporary_address_space=temporary_address_space,
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 96c8252ef7e6622250e9006b2275ef7816700b5c..8432d59ec5b162f6e963abbeae3b2fcabe94cf27 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -977,8 +977,8 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset(
# is inspected. For each element of the power set without the
# empty and the full set, one duplication option is generated.
for insns_to_dup in it.chain.from_iterable(
- it.combinations(iname_insns, l)
- for l in range(1, len(iname_insns))):
+ it.combinations(iname_insns, i)
+ for i in range(1, len(iname_insns))):
yield (
iname,
tuple(insn | old_common_inames for insn in insns_to_dup))
@@ -1184,6 +1184,19 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
# {{{ remove unused inames
+def get_used_inames(knl):
+ import loopy as lp
+ exp_knl = lp.expand_subst(knl)
+
+ used_inames = set()
+ for insn in exp_knl.instructions:
+ used_inames.update(
+ exp_knl.insn_inames(insn.id)
+ | insn.reduction_inames())
+
+ return used_inames
+
+
def remove_unused_inames(knl, inames=None):
"""Delete those among *inames* that are unused, i.e. project them
out of the domain. If these inames pose implicit restrictions on
@@ -1204,17 +1217,7 @@ def remove_unused_inames(knl, inames=None):
# {{{ check which inames are unused
- import loopy as lp
- exp_knl = lp.expand_subst(knl)
-
- inames = set(inames)
- used_inames = set()
- for insn in exp_knl.instructions:
- used_inames.update(
- exp_knl.insn_inames(insn.id)
- | insn.reduction_inames())
-
- unused_inames = inames - used_inames
+ unused_inames = set(inames) - get_used_inames(knl)
# }}}
@@ -1235,6 +1238,33 @@ def remove_unused_inames(knl, inames=None):
return knl
+
+def remove_any_newly_unused_inames(transformation_func):
+ from functools import wraps
+
+ @wraps(transformation_func)
+ def wrapper(knl, *args, **kwargs):
+
+ # check for remove_unused_inames argument, default: True
+ remove_newly_unused_inames = kwargs.pop("remove_newly_unused_inames", True)
+
+ if remove_newly_unused_inames:
+ # determine which inames were already unused
+ inames_already_unused = knl.all_inames() - get_used_inames(knl)
+
+ # call transform
+ transformed_knl = transformation_func(knl, *args, **kwargs)
+
+ # Remove inames that are unused due to transform
+ return remove_unused_inames(
+ transformed_knl,
+ transformed_knl.all_inames()-inames_already_unused)
+ else:
+ # call transform
+ return transformation_func(knl, *args, **kwargs)
+
+ return wrapper
+
# }}}
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 9f426f76bc6902fd09bd7685c73f187df935be1e..b308836c7727564dbfa9625ad39f378e8034c68c 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -229,7 +229,8 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
for insn in kernel.instructions:
self.replaced_something = False
- insn = insn.with_transformed_expressions(self, kernel, insn)
+ insn = insn.with_transformed_expressions(
+ lambda expr: self(expr, kernel, insn))
if self.replaced_something:
insn = insn.copy(
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index b92698ffa1e84455be3f79bed7dbf884f36be490..717a051930e938457dae0ee4441325b3e631d2d9 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -25,10 +25,9 @@ THE SOFTWARE.
import six
from loopy.symbolic import (
- get_dependencies, SubstitutionMapper,
RuleAwareIdentityMapper, SubstitutionRuleMappingContext)
from loopy.diagnostic import LoopyError
-from pymbolic.mapper.substitutor import make_subst_func
+from loopy.transform.iname import remove_any_newly_unused_inames
from pytools import ImmutableRecord
from pymbolic import var
@@ -80,40 +79,13 @@ def extract_subst(kernel, subst_name, template, parameters=()):
# }}}
- # {{{ deal with iname deps of template that are not independent_inames
-
- # (We call these 'matching_vars', because they have to match exactly in
- # every CSE. As above, they might need to be renamed to make them unique
- # within the kernel.)
-
- matching_vars = []
- old_to_new = {}
-
- for iname in (get_dependencies(template)
- - set(parameters)
- - kernel.non_iname_variable_names()):
- if iname in kernel.all_inames():
- # need to rename to be unique
- new_iname = var_name_gen(iname)
- old_to_new[iname] = var(new_iname)
- matching_vars.append(new_iname)
- else:
- matching_vars.append(iname)
-
- if old_to_new:
- template = (
- SubstitutionMapper(make_subst_func(old_to_new))
- (template))
-
- # }}}
-
# {{{ gather up expressions
expr_descriptors = []
from loopy.symbolic import UnidirectionalUnifier
unif = UnidirectionalUnifier(
- lhs_mapping_candidates=set(parameters) | set(matching_vars))
+ lhs_mapping_candidates=set(parameters))
def gather_exprs(expr, mapper):
urecs = unif(template, expr)
@@ -177,8 +149,30 @@ def extract_subst(kernel, subst_name, template, parameters=()):
new_insns = []
+ def transform_assignee(expr):
+ # Assignment LHS's cannot be subst rules. Treat them
+ # specially.
+
+ import pymbolic.primitives as prim
+ if isinstance(expr, tuple):
+ return tuple(
+ transform_assignee(expr_i)
+ for expr_i in expr)
+
+ elif isinstance(expr, prim.Subscript):
+ return type(expr)(
+ expr.aggregate,
+ cbmapper(expr.index))
+
+ elif isinstance(expr, prim.Variable):
+ return expr
+ else:
+ raise ValueError("assignment LHS not understood")
+
for insn in kernel.instructions:
- new_insns.append(insn.with_transformed_expressions(cbmapper))
+ new_insns.append(
+ insn.with_transformed_expressions(
+ cbmapper, assignee_f=transform_assignee))
from loopy.kernel.data import SubstitutionRule
new_substs = {
@@ -285,6 +279,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper):
return var(subst_name)(*index)
+@remove_any_newly_unused_inames
def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
force_retain_argument=False):
"""Extract an assignment (to a temporary variable or an argument)
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 010a0658f71bcfcb037a81c6b61fd9417fc98b75..32f039a22a5f8ff076669ecb23f00ad63ed85dd5 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -532,7 +532,7 @@ def infer_unknown_types(kernel, expect_completion=False):
if read_var in names_for_type_inference))
for written_var in names_for_type_inference)
- from loopy.tools import compute_sccs
+ from pytools.graph import compute_sccs
# To speed up processing, we sort the variables by computing the SCCs of the
# type dependency graph. Each SCC represents a set of variables whose types
diff --git a/loopy/version.py b/loopy/version.py
index 29abbc2de889b884de93e5fe39a1d996811c93c9..d69a3b574122622105e4b52c74ec8c595fc816b6 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -42,7 +42,7 @@ else:
# }}}
-VERSION = (2019, 1)
+VERSION = (2020, 1)
VERSION_STATUS = ""
VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
diff --git a/setup.cfg b/setup.cfg
index eec3dfd1f52ed97c58f5281716eac8fc18980094..a0d95746e1a399d6a2d7c315bffc9b834d2f5487 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,N817,W504
max-line-length=85
exclude=
loopy/target/c/compyte/ndarray,
diff --git a/setup.py b/setup.py
index 75d8b340e8ad98794a244f7e5da89e079870bd2b..bba29986997e8e762ad52f38feae6311c4892c10 100644
--- a/setup.py
+++ b/setup.py
@@ -76,10 +76,7 @@ setup(name="loo.py",
'License :: OSI Approved :: MIT License',
'Natural Language :: English',
'Programming Language :: Python',
- 'Programming Language :: Python :: 2.6',
- 'Programming Language :: Python :: 2.7',
- 'Programming Language :: Python :: 3.2',
- 'Programming Language :: Python :: 3.3',
+ 'Programming Language :: Python :: 3',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Information Analysis',
'Topic :: Scientific/Engineering :: Mathematics',
@@ -89,7 +86,7 @@ setup(name="loo.py",
],
install_requires=[
- "pytools>=2018.4",
+ "pytools>=2020.2",
"pymbolic>=2019.2",
"genpy>=2016.1.2",
"cgen>=2016.1",
diff --git a/test/test_apps.py b/test/test_apps.py
index e07262dbdda8ad3c24522f7d0eb4dba8422bf0ce..71029cc9ce408f8e7fa95eaf3b766864c4beee5b 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -566,7 +566,7 @@ def test_poisson_fem(ctx_factory):
sdim = 3
knl = lp.make_kernel(
- "{ [c,i,j,k,ell,ell2,ell3]: \
+ "{ [c,i,j,k,ell,ell2]: \
0 <= c < nels and \
0 <= i < nbf and \
0 <= j < nbf and \
diff --git a/test/test_domain.py b/test/test_domain.py
index ebfde850907d68bebf06076fbf1c87d8bb093f71..5daf84eaa5b7ffd1647daf4b35acd7a5de91c5d1 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -111,7 +111,7 @@ def test_eq_constraint(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
- "{[i,j]: 0<= i,j < 32}",
+ "{[i]: 0<= i < 32}",
[
"a[i] = b[i]"
],
diff --git a/test/test_fortran.py b/test/test_fortran.py
index e08033360d403f548d552108e6fd98b9117e19bd..3601e96b752f18e6e01bcfcffe49780bda4058b4 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -275,6 +275,12 @@ def test_tagged(ctx_factory):
"i_inner,j_inner",
])
def test_matmul(ctx_factory, buffer_inames):
+ ctx = ctx_factory()
+
+ if (buffer_inames and
+ ctx.devices[0].platform.name == "Portable Computing Language"):
+ pytest.skip("crashes on pocl")
+
logging.basicConfig(level=logging.INFO)
fortran_src = """
@@ -316,7 +322,6 @@ def test_matmul(ctx_factory, buffer_inames):
knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
init_expression="0", store_expression="base+buffer")
- ctx = ctx_factory()
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 203ebb3922d3cc7f41b56abc31202b8974b88117..f9345d5b6cd9b97da80bb2ff8e5c6c657c199402 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -177,7 +177,7 @@ def test_simple_side_effect(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
- "{[i,j]: 0<=i,j<100}",
+ "{[i]: 0<=i<100}",
"""
a[i] = a[i] + 1
""",
@@ -456,7 +456,7 @@ def test_nonlinear_index(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
- "{[i,j]: 0<=i,j src_ibox = source_boxes[isrc_box]
@@ -769,7 +769,7 @@ def test_multiple_writes_to_local_temporary():
# writes are OK.
knl = lp.make_kernel(
- "{[i,e]: 0<=i<5 and 0<=e temp[i, 0] = 17
temp[i, 1] = 15
@@ -952,7 +952,7 @@ def test_atomic_init(dtype):
vec_width = 4
knl = lp.make_kernel(
- "{ [i,j]: 0<=i<100 }",
+ "{ [i]: 0<=i<100 }",
"""
out[i%4] = 0 {id=init, atomic=init}
""",
@@ -1555,7 +1555,7 @@ def test_finite_difference_expr_subst(ctx_factory):
gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True,
default_tag="l.auto")
- precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
+ precomp_knl = lp.tag_inames(precomp_knl, {"j_outer": "unr"})
precomp_knl = lp.set_options(precomp_knl, return_dict=True)
evt, _ = precomp_knl(queue, u=u, h=h)
@@ -1926,8 +1926,9 @@ def test_scalars_with_base_storage(ctx_factory):
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
+ import islpy as isl
knl = lp.make_kernel(
- "{ [i]: 0<=i<1}",
+ [isl.BasicSet("[] -> {[]: }")], # empty (domain w/unused inames errors)
"a = 1",
[lp.TemporaryVariable("a", dtype=np.float64,
shape=(), base_storage="base")])
diff --git a/test/test_misc.py b/test/test_misc.py
index 7a834a6f5d393298e97df22d47a1de3b64354a42..dc5045fe0f7a3756d9a70a52d0a0c3dbb92f3e69 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -24,7 +24,6 @@ THE SOFTWARE.
import six # noqa
import pytest
-from six.moves import range
import sys
@@ -35,50 +34,6 @@ logger = logging.getLogger(__name__)
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa
-def test_compute_sccs():
- from loopy.tools import compute_sccs
- import random
-
- rng = random.Random(0)
-
- def generate_random_graph(nnodes):
- graph = dict((i, set()) for i in range(nnodes))
- for i in range(nnodes):
- for j in range(nnodes):
- # Edge probability 2/n: Generates decently interesting inputs.
- if rng.randint(0, nnodes - 1) <= 1:
- graph[i].add(j)
- return graph
-
- def verify_sccs(graph, sccs):
- visited = set()
-
- def visit(node):
- if node in visited:
- return []
- else:
- visited.add(node)
- result = []
- for child in graph[node]:
- result = result + visit(child)
- return result + [node]
-
- for scc in sccs:
- scc = set(scc)
- assert not scc & visited
- # Check that starting from each element of the SCC results
- # in the same set of reachable nodes.
- for scc_root in scc:
- visited.difference_update(scc)
- result = visit(scc_root)
- assert set(result) == scc, (set(result), scc)
-
- for nnodes in range(10, 20):
- for i in range(40):
- graph = generate_random_graph(nnodes)
- verify_sccs(graph, compute_sccs(graph))
-
-
def test_SetTrie():
from loopy.kernel.tools import SetTrie
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index e022e92f3712d984c1ad68061d0052240ff9d20c..54c64e0a4d4a23b429eb83be6c0a19f482a1b922 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -48,7 +48,7 @@ def test_tim2d(ctx_factory):
# K - run-time symbolic
knl = lp.make_kernel(
- "{[i,j,e,m,o,o2,gi]: 0<=i,j,m,o,o2 {[]: }")],
"""
a, b = make_tuple(1, 2.)
""")
diff --git a/test/test_transform.py b/test/test_transform.py
index cdc0c14b8bacc4fe5279d000461c0ea2244af021..ffef893b05fbca5a0d244ff17f379e1bb5cf27a1 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -74,7 +74,7 @@ def test_collect_common_factors(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
- "{[i,j,k]: 0<=i,j out_tmp = 0 {id=out_init,inames=i}
out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init}
@@ -385,7 +385,7 @@ def test_precompute_nested_subst(ctx_factory):
ctx = ctx_factory()
knl = lp.make_kernel(
- "{[i,j]: 0<=i 1:
exec(sys.argv[1])