Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Commits on Source (5837)
Showing with 1403 additions and 11 deletions
#! /bin/bash
set -e
function install_example_prereqs()
{
    # Install everything the example scripts/notebooks require.
    # ipython_genutils pinned in: https://github.com/jupyter/nbconvert/issues/1725
    # jinja2 < 3.1 pinned in:     https://github.com/jupyter/nbconvert/issues/1736
    with_echo pip install matplotlib ipykernel nbconvert \
        ipython_genutils 'jinja2 < 3.1'
    install_ispc
}
function run_examples()
{
    # Run every file under examples/ whose name matches a glob.
    #
    # $1 - PATTERN: find(1) -name glob, e.g. "*.py"
    # $2 - CMDLINE: command to run each file with; deliberately left
    #      unquoted at the call site so it may carry flags,
    #      e.g. "python -m nbconvert --to html --execute"
    PATTERN=$1
    CMDLINE=$2
    # while-read instead of for-over-$(find): does not word-split paths,
    # so filenames containing spaces survive.
    find examples -name "$PATTERN" -print | while IFS= read -r i; do
        echo "-----------------------------------------------------------------------"
        echo "RUNNING $i"
        echo "-----------------------------------------------------------------------"
        dn=$(dirname "$i")
        bn=$(basename "$i")
        # Subshell keeps the cd from leaking; "$dn" quoted (was unquoted);
        # && instead of ; so we never run CMDLINE in the wrong directory.
        (cd "$dn" && echo $CMDLINE "$bn" && $CMDLINE "$bn")
    done
}
function run_py_examples()
{
    # Run all Python example scripts with the configured interpreter.
    # BUGFIX: ${PY_EXE} was unquoted; an interpreter spec with flags
    # (e.g. "python -u") word-split into multiple arguments, and
    # run_examples only reads $1/$2, so the interpreter was dropped.
    # Quoting delivers the whole spec as one CMDLINE argument, which
    # run_examples then word-splits intentionally at the point of use.
    run_examples "*.py" "${PY_EXE}"
}
function run_ipynb_examples()
{
    # Execute and convert all notebook examples to HTML via nbconvert.
    local nb_cmdline="${PY_EXE} -m nbconvert --to html --execute"
    run_examples "*.ipynb" "$nb_cmdline"
}
function run_floopy_examples()
{
    # Run all Floopy (Fortran+Loopy) examples through the loopy CLI.
    local floopy_cmdline="${PY_EXE} -m loopy"
    run_examples "*.floopy" "$floopy_cmdline"
}
# https://editorconfig.org/
# https://github.com/editorconfig/editorconfig-vim
# https://github.com/editorconfig/editorconfig-emacs
root = true
[*]
indent_style = space
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
[*.py]
indent_size = 4
[*.rst]
indent_size = 4
[*.cpp]
indent_size = 2
[*.hpp]
indent_size = 2
# There may be one in doc/
[Makefile]
indent_style = tab
# https://github.com/microsoft/vscode/issues/1679
[*.md]
trim_trailing_whitespace = false
version: 2
updates:
# Set update schedule for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
# vim: sw=4
name: Gitlab mirror
on:
push:
branches:
- main
jobs:
autopush:
name: Automatic push to gitlab.tiker.net
if: startsWith(github.repository, 'inducer/')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: |
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
mirror_github_to_gitlab
env:
GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }}
# vim: sw=4
name: CI
on:
push:
branches:
- main
pull_request:
schedule:
- cron: '17 3 * * 0'
concurrency:
group: ${{ github.head_ref || github.ref_name }}
cancel-in-progress: true
jobs:
ruff:
name: Ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: true
- uses: astral-sh/setup-uv@v5
- name: "Main Script"
run: |
uv run --only-dev ruff check
typos:
name: Typos
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/typos@master
pylint:
name: Pylint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml
USE_CONDA_BUILD=1
curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh
. ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py
mypy:
name: Mypy
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
EXTRA_INSTALL="mypy pytest types-colorama types-Pygments"
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_conda_env
./run-mypy.sh
pytest:
name: Conda Pytest
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh
. ./build-and-test-py-project-within-miniconda.sh
pytest_intel:
name: Conda Pytest with Intel CL
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://raw.githubusercontent.com/illinois-scicomp/machine-shop-maintenance/main/install-intel-icd.sh
sudo bash ./install-intel-icd.sh
CONDA_ENVIRONMENT=.test-conda-env-py3.yml
echo "- ocl-icd-system" >> "$CONDA_ENVIRONMENT"
sed -i "/pocl/ d" "$CONDA_ENVIRONMENT"
export PYOPENCL_TEST=intel
source /opt/enable-intel-cl.sh
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_conda_env
test_py_project
pytest_no_arg_check:
name: Conda Pytest without arg check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh
export _LOOPY_SKIP_ARG_CHECKS=1
. ./build-and-test-py-project-within-miniconda.sh
pytest_twice:
name: Conda Pytest Twice (for cache behavior)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
# This test makes sure that loopy can run with kernels loaded from disk cache.
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_conda_env
( test_py_project )
# See https://github.com/inducer/loopy/pull/828 why this is disabled.
# export LOOPY_ABORT_ON_CACHE_MISS=1
( test_py_project )
examples:
name: Conda Examples
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_conda_env
rewrite_pyopencl_test
. ./.ci/examples-funcs.sh
install_example_prereqs
run_py_examples
run_ipynb_examples
run_floopy_examples
docs:
name: Documentation
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
-
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: "Main Script"
run: |
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_conda_env
build_docs
downstream_tests:
strategy:
matrix:
downstream_project: [arraycontext, meshmode, grudge, pytential, pytato]
fail-fast: false
name: Tests for downstream project ${{ matrix.downstream_project }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: "Main Script"
env:
DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }}
run: |
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
test_downstream "$DOWNSTREAM_PROJECT"
downstream_firedrake:
name: Tests for downstream project Firedrake
runs-on: ubuntu-latest
container:
image: 'firedrakeproject/firedrake'
steps:
- name: "Main script"
run: |
cd /root
python3 -m venv --system-site-packages myvenv
export HOME="$(pwd)"
mkdir loopy
cd loopy
git init
git remote add origin "https://github.com/$GITHUB_REPOSITORY.git"
git fetch origin "$GITHUB_REF"
git checkout FETCH_HEAD
git submodule update --init
. /root/myvenv/bin/activate
pip install --editable .
pip uninstall -y pytools
pip uninstall -y pymbolic
pip install "git+https://github.com/inducer/pytools.git#egg=pytools"
pip install "git+https://github.com/inducer/pymbolic.git#egg=pymbolic"
cd /opt/firedrake
# patch so exception messages get shown
curl -L https://gist.githubusercontent.com/inducer/17d7134ace215f0df1f3627eac4195c7/raw/ec5470a7d8587b6e1f336f3ef1d0ece5e26f236a/firedrake-debug-patch.diff | patch -p1
sed -i 's/@mpiexec/@mpiexec --oversubscribe/' Makefile
make check
validate_cff:
name: Validate CITATION.cff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- run: |
pip install cffconvert
cffconvert -i CITATION.cff --validate
# vim: sw=4
......@@ -13,3 +13,22 @@ distribute*egg
distribute*tar.gz
*.log
*profiler.conf
core
.coverage
htmlcov
.ipynb_checkpoints
lextab.py
yacctab.py
.pytest_cache/*
.cache
.env
virtualenv-[0-9]*[0-9]
*.so
.asv
# Files used by run-pylint.sh
.pylintrc.yml
.run-pylint.py
Pytest POCL:
script:
- export PYOPENCL_TEST=portable:pthread
- export EXTRA_INSTALL="pybind11 numpy mako"
- export LOOPY_NO_CACHE=1
- curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Pytest Nvidia Titan V:
script:
- export PYOPENCL_TEST=nvi:titan
- export EXTRA_INSTALL="pybind11 numpy mako"
- export LOOPY_NO_CACHE=1
- source /opt/enable-intel-cl.sh
- curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python3
- nvidia-titan-v
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Pytest POCL without arg check:
script:
- export PYOPENCL_TEST=portable:pthread
- export EXTRA_INSTALL="pybind11 numpy mako"
- export LOOPY_NO_CACHE=1
- export _LOOPY_SKIP_ARG_CHECKS=1
- curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Pytest Intel:
script:
- export PYOPENCL_TEST=intel
- export EXTRA_INSTALL="pybind11 numpy mako"
- export LOOPY_NO_CACHE=1
- export LOOPY_INTEL_CL_OK_FOR_TEST_REF=1
- source /opt/enable-intel-cl.sh
- curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
- ". ./build-and-test-py-project.sh"
tags:
- python3
- intel-cl-cpu
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
Pytest POCL Twice With Cache:
script: |
export PYOPENCL_TEST=portable:pthread
export EXTRA_INSTALL="pybind11 numpy mako"
curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh
. ./ci-support.sh
build_py_project_in_venv
( test_py_project )
( test_py_project )
tags:
- python3
- pocl
except:
- tags
artifacts:
reports:
junit: test/pytest.xml
# PyPy POCL:
# script:
# - export PY_EXE=pypy
# - export PYOPENCL_TEST=portable:pthread
# - export EXTRA_INSTALL="pybind11 numpy mako"
# - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
# - ". ./build-and-test-py-project.sh"
# tags:
# - pypy
# - pocl
# except:
# - tags
Pytest POCL Examples:
script: |
export PYOPENCL_TEST=portable:pthread
export EXTRA_INSTALL="pybind11 numpy mako"
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_venv
rewrite_pyopencl_test
. ./.ci/examples-funcs.sh
install_example_prereqs
run_py_examples
run_ipynb_examples
run_floopy_examples
tags:
- python3
- pocl
- large-node
# For examples/python/ispc-stream-harness.py
- avx2
except:
- tags
Pylint:
script:
# Needed to avoid name shadowing issues when running from source directory.
- PROJECT_INSTALL_FLAGS="--editable"
- EXTRA_INSTALL="pybind11 numpy mako matplotlib ipykernel ply fparser"
- curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh
- . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py
tags:
- python3
except:
- tags
Documentation:
script: |
EXTRA_INSTALL="pybind11 numpy"
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_venv
build_docs
build_asv_html
maybe_upload_docs
tags:
- python3
Ruff:
script:
- pipx install uv
- uv run --only-dev ruff check
tags:
- docker-runner
except:
- tags
Mypy:
script: |
EXTRA_INSTALL="mypy pybind11 numpy types-colorama types-Pygments"
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
build_py_project_in_venv
./run-mypy.sh
tags:
- python3
except:
- tags
Downstream:
parallel:
matrix:
- DOWNSTREAM_PROJECT: [arraycontext, meshmode, grudge, pytential, pytato]
tags:
- large-node
- "docker-runner"
script: |
curl -L -O https://tiker.net/ci-support-v0
. ./ci-support-v0
test_downstream "$DOWNSTREAM_PROJECT"
[submodule "loopy/target/c/compyte"]
path = loopy/target/c/compyte
url = https://github.com/inducer/compyte.git
- arg: extension-pkg-whitelist
val: islpy
- arg: ignore
val:
- compyte
- arg: ignored-modules
val:
- IPython
- pycuda
- matplotlib
- matplotlib.pyplot
- arg: init-hook
val: import sys; sys.setrecursionlimit(5000)
- arg: disable
val:
- E1102
name: test-conda-env
channels:
- conda-forge
- nodefaults
dependencies:
- python=3
- git
- numpy
- pocl
- mako
- pyopencl
- islpy
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
# major contributors
- family-names: "Kloeckner"
given-names: "Andreas"
orcid: "https://orcid.org/0000-0003-1228-519X"
- family-names: Kulkarni
given-names: Kaushik
email: kaushikcfd@gmail.com
- family-names: Kempf
given-names: Dominic
email: dominic.r.kempf@gmail.com
- family-names: Wala
given-names: Matt
email: wala1@illinois.edu
- family-names: Curtis
given-names: Nick
email: arghdos@gmail.com
- family-names: Stevens
given-names: James
email: jdsteve2@illinois.edu
- family-names: Fernando
given-names: Isuru
email: isuruf@gmail.com
# smaller fixes
- family-names: Mitchell
given-names: Lawrence
email: lawrence@wence.uk
- family-names: Alvey-Blanco
given-names: Addison J.
email: aalveyblanco@gmail.com
- family-names: Fikl
given-names: Alexandru
email: alexfikl@gmail.com
- family-names: Malone
given-names: Chris
email: chris.m.malone@gmail.com
- family-names: Ward
given-names: Connor
email: c.ward20@imperial.ac.uk
- family-names: Wilcox
given-names: Lucas C.
email: lucas@swirlee.com
- family-names: Koch
given-names: Marcel
email: marcel.koch@uni-muenster.de
- family-names: Woodman
given-names: Marmaduke
email: marmaduke.woodman@univ-amu.fr
- family-names: Smith
given-names: Matthew
email: mjsmith6@illinois.edu
- family-names: Diener
given-names: Matthias
email: mdiener@illinois.edu
- family-names: Christensen
given-names: Nicholas
email: njchris2@illinois.edu
- family-names: Nytko
given-names: Nicolas
email: nnytko2@illinois.edu
- family-names: Kirby
given-names: Robert C.
email: Robert_Kirby@baylor.edu
- family-names: Hegmann
given-names: Sebastian
email: shegmann@nina.iwr.uni-heidelberg.de
- family-names: Vorderwuelbecke
given-names: Sophia
email: sv2518@ic.ac.uk
- family-names: Ratnayaka
given-names: Thilina
email: thilinarmtb@gmail.com
- family-names: Gibson
given-names: Thomas
email: gibsonthomas1120@hotmail.com
- family-names: Sun
given-names: Tianjiao
email: tj-sun@tianjiaos-air.home
- family-names: Smith
given-names: Timothy A.
email: tasmith4@illinois.edu
- family-names: Warburton
given-names: Tim
email: timwar@caam.rice.edu
- family-names: Wei
given-names: Xiaoyu
email: wxy0516@gmail.com
- family-names: Weiner
given-names: Zach
email: zachjweiner@gmail.com
title: "Loopy"
version: 2024.1
date-released: 2024-02-16
url: "https://github.com/inducer/loopy"
doi: 10.5281/zenodo.10672275
license: MIT
MIT License
Copyright (c) 2018 Andreas Klöckner and contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
......@@ -7,7 +7,7 @@ Documentation Notes
Things to consider
^^^^^^^^^^^^^^^^^^
- Depedencies are pointwise for shared loop dimensions
- Dependencies are pointwise for shared loop dimensions
and global over non-shared ones (between dependent and ancestor)
- multiple insns could fight over which iname gets local axis 0
......@@ -16,16 +16,17 @@ Things to consider
- Every loop in loopy is opened at most once.
Too restrictive?
- Loop bounds currently may not depend on parallel dimensions
Does it make sense to relax this?
- Why do CSEs necessarily have to duplicate the inames?
- Why do precomputes necessarily have to duplicate the inames?
-> because that would be necessary for a sequential prefetch
- Cannot do slab decomposition on inames that share a tag with
other inames
-> Is that reasonable?
- Entering a loop means:
- setting up conditionals related to it (slabs/bounds)
- allowing loops nested inside to depend on loop state
- Not using all hw loop dimensions causes an error, as
is the case for variant 3 in the rank_one test.
......@@ -40,17 +41,43 @@ Things to consider
- Limitation: base index for parallel axes is 0.
- Dependency on order of operations is ill-formed
- Dependency on non-local global writes is ill-formed
- No substitution rules allowed on lhs of insns
To-do
^^^^^
- Kernel splitting (via what variables get computed in a kernel)
- Kernel fusion
- when are link_inames, duplicate_inames safe?
- rename IndexTag -> InameTag
- Data implementation tags
- turn base_indices into offset
- vectorization
- write_image()
- change_arg_to_image (test!)
- Make tests run on GPUs
- Fuse: store/fetch elimination?
- Test array access with modulo
- Derive all errors from central hierarchy
- Provide context for more errors?
- Allow mixing computed and stored strides
Fixes:
- applied_iname_rewrites tracking for prefetch footprints isn't bulletproof
old inames may still be around, so the rewrite may or may not have to be
applied.
- Group instructions by dependency/inames for scheduling, to
increase sched. scalability
......@@ -62,7 +89,15 @@ Fixes:
Future ideas
^^^^^^^^^^^^
- Expose iname-duplicate-and-rename as a primitive.
- subtract_domain_lower_bound
- Storage sharing for temporaries?
- Kernel splitting (via what variables get computed in a kernel)
- Put all OpenCL functions into mangler
- Fuse: store/fetch elimination?
- Array language
......@@ -80,9 +115,6 @@ Future ideas
- Float4 joining on fetch/store?
- How can one automatically generate something like microblocks?
-> Some sort of axis-adding transform?
- Better for loop bound generation
-> Try a triangular loop
......@@ -107,6 +139,33 @@ Future ideas
Dealt with
^^^^^^^^^^
- How can one automatically generate something like microblocks?
-> Some sort of axis-adding transform?
- RuleAwareIdentityMapper
extract_subst -> needs WalkMapper [actually fine as is]
padding [DONE]
replace make_unique_var_name [DONE]
join_inames [DONE]
duplicate_inames [DONE]
split_iname [DONE]
CSE [DONE]
- rename iname
- delete unused inames
- Expose iname-duplicate-and-rename as a primitive.
- make sure simple side effects work
- Loop bounds currently may not depend on parallel dimensions
Does it make sense to relax this?
- Streamline argument specification
- syntax for linear array access
- Test divisibility constraints
- Test join_inames
......
Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code
=======================================================================
.. image:: https://gitlab.tiker.net/inducer/loopy/badges/main/pipeline.svg
:alt: Gitlab Build Status
:target: https://gitlab.tiker.net/inducer/loopy/commits/main
.. image:: https://github.com/inducer/loopy/actions/workflows/ci.yml/badge.svg
:alt: Github Build Status
:target: https://github.com/inducer/loopy/actions/workflows/ci.yml
.. image:: https://badge.fury.io/py/loopy.svg
:alt: Python Package Index Release Page
:target: https://pypi.org/project/loopy/
.. image:: https://zenodo.org/badge/20281732.svg
:alt: Zenodo DOI for latest release
:target: https://zenodo.org/doi/10.5281/zenodo.10672274
Loopy lets you easily generate the tedious, complicated code that is necessary
to get good performance out of GPUs and multi-core CPUs.
Loopy's core idea is that a computation should be described simply and then
*transformed* into a version that gets high performance. This transformation
takes place under user control, from within Python.
It can capture the following types of optimizations:
* Vector and multi-core parallelism in the OpenCL/CUDA model
* Data layout transformations (structure of arrays to array of structures)
* Loop unrolling
* Loop tiling with efficient handling of boundary cases
* Prefetching/copy optimizations
* Instruction level parallelism
* and many more!
Loopy targets array-type computations, such as the following:
* dense linear algebra,
* convolutions,
* n-body interactions,
* PDE solvers, such as finite element, finite difference, and
Fast-Multipole-type computations.
It is not (and does not want to be) a general-purpose programming language.
Loopy is licensed under the liberal `MIT license
<https://en.wikipedia.org/wiki/MIT_License>`__ and free for commercial, academic,
and private use. All of Loopy's dependencies can be automatically installed from
the package index after using::
pip install loopy
In addition, Loopy is compatible with and enhances
`pyopencl <https://mathema.tician.de/software/pyopencl>`__.
---
Places on the web related to Loopy:
* `Python Package Index <https://pypi.org/project/loopy>`__ (download releases)
* `Documentation <https://documen.tician.de/loopy>`__ (read how things work)
* `Github <https://github.com/inducer/loopy>`__ (get latest source code, file bugs)
* `Homepage <https://mathema.tician.de/software/loopy>`__
* `Benchmarks <https://documen.tician.de/loopy/benchmarks>`__
FORTRAN:
do/continue
case sensitivity
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,
// The name of the project being benchmarked
"project": "loopy",
// The project's homepage
"project_url": "https://documen.tician.de/loopy",
// The URL or local path of the source code repository for the
// project being benchmarked
"repo": ".",
// The Python project's subdirectory in your repo. If missing or
// the empty string, the project is assumed to be located at the root
// of the repository.
// "repo_subdir": "",
// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["main"], // for git
// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
// "dvcs": "git",
// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
//"install_timeout": 600,
// the base URL to show a commit for the project.
"show_commit_url": "http://github.com/inducer/loopy/commit/",
// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["2.7", "3.6"],
// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge", "defaults"],
// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
// list or empty string indicates to just test against the default
// (latest) version. null indicates that the package is to not be
// installed. If the package to be tested is only available from
// PyPi, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed via
// pip (with all the conda available packages installed first,
// followed by the pip installed packages).
//
// "matrix": {
// "numpy": ["1.6", "1.7"],
// "six": ["", null], // test with and without six installed
// "pip+emcee": [""], // emcee is only available for install with pip.
// },
"matrix": {
"numpy" : [""],
"pyopencl" : [""],
"islpy" : [""],
"pocl" : [""],
"pip+git+https://github.com/inducer/pymbolic#egg=pymbolic": [""],
"pip+git+https://github.com/inducer/boxtree#egg=boxtree": [""],
"pip+git+https://github.com/inducer/loopy#egg=loopy": [""],
"pip+git+https://github.com/inducer/sumpy#egg=sumpy": [""],
},
// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
//
// "exclude": [
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
// {"environment_type": "conda", "six": null}, // don't run without six on conda
// ],
//
// "include": [
// // additional env for python2.7
// {"python": "2.7", "numpy": "1.8"},
// // additional env if run on windows+conda
// {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
// ],
// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
// "benchmark_dir": "benchmarks",
// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",
// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",
// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html",
// The number of characters to retain in the commit hashes.
// "hash_length": 8,
// `asv` will cache wheels of the recent builds in each
// environment, making them faster to install next time. This is
// number of builds to keep, per environment.
// "wheel_cache_size": 0
// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// }
// The thresholds for relative change in results, after which `asv
// publish` starts reporting regressions. Dictionary of the same
// form as in ``regressions_first_commits``, with values
// indicating the thresholds. If multiple entries match, the
// maximum is taken. If no entry matches, the default is 5%.
//
// "regressions_thresholds": {
// "some_benchmark": 0.01, // Threshold of 1%
// "another_benchmark": 0.5, // Threshold of 50%
// }
}
#! /usr/bin/env python
# Thin launcher: delegate straight to the loopy command-line interface.
if __name__ == "__main__":
    from loopy.cli import main
    main()
#!/usr/bin/env python
# Standalone experiment: compare C's truncating integer division and
# remainder ('/', '%') against Python's flooring '//' and '%', using a
# small C library compiled at runtime and loaded through ctypes.
import ctypes
from os import system

# C source for the helper library.  Besides plain cdiv/cmod (native C
# semantics), it defines floor-division and positive-divisor variants
# for several integer widths via the LOOPY_CALL_WITH_INTEGER_TYPES
# X-macro.  NOTE: this string is written to disk, compiled, and
# executed by main() below -- it is runtime data, not a comment.
C_SRC = """
#include <stdlib.h>
#include <stdint.h>
int64_t cdiv(int64_t a, int64_t b)
{
return a/b;
}
int64_t cmod(int64_t a, int64_t b)
{
return a%b;
}
#define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \
MACRO_NAME(int8, char) \
MACRO_NAME(int16, short) \
MACRO_NAME(int32, int) \
MACRO_NAME(int64, long long)
#define LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \
TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \
{ \
if ((a<0) != (b<0)) \
a = a - (b + (b<0) - (b>=0)); \
return a/b; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV)
#undef LOOPY_DEFINE_FLOOR_DIV
#define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \
TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
if (a<0) \
a = a - (b-1); \
return a/b; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)
#undef LOOPY_DEFINE_FLOOR_DIV_POS_B
#define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \
TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0) \
result += b; \
return result; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B)
#undef LOOPY_DEFINE_MOD_POS_B
#define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \
TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \
{ \
TYPE result = a%b; \
if (result < 0 && b > 0) \
result += b; \
if (result > 0 && b < 0) \
result = result + b; \
return result; \
}
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD)
#undef LOOPY_DEFINE_MOD
"""
def main():
    """Write C_SRC to disk, compile it with gcc, and exhaustively
    cross-check the compiled division/modulo helpers against Python's
    ``//`` and ``%`` over a small integer range.

    Side effects: creates int-experiments.c and int-experiments.so in
    the current directory; prints one comparison line at the end.
    Raises AssertionError on the first mismatch (or if gcc fails).
    """
    with open("int-experiments.c", "w") as outf:
        outf.write(C_SRC)

    # Check the compiler's exit status instead of silently continuing
    # with a stale (or missing) shared object.
    compile_rv = system("gcc -Wall -shared int-experiments.c -o int-experiments.so")
    assert compile_rv == 0, "gcc failed to build int-experiments.so"

    int_exp = ctypes.CDLL("int-experiments.so")
    for func in [
            int_exp.cdiv,
            int_exp.cmod,
            int_exp.loopy_floor_div_int64,
            int_exp.loopy_floor_div_pos_b_int64,
            int_exp.loopy_mod_pos_b_int64,
            int_exp.loopy_mod_int64,
            ]:
        func.argtypes = [ctypes.c_longlong, ctypes.c_longlong]
        func.restype = ctypes.c_longlong

    cmod = int_exp.cmod
    int_floor_div = int_exp.loopy_floor_div_int64
    int_floor_div_pos_b = int_exp.loopy_floor_div_pos_b_int64
    int_mod_pos_b = int_exp.loopy_mod_pos_b_int64
    int_mod = int_exp.loopy_mod_int64

    m = 50

    # BUGFIX: the original printed (a, b, cresult, presult) in an
    # ``if cresult != presult`` block placed *after* the assert, so the
    # diagnostic was unreachable; carry the values in the assert message.

    # floor division, divisor restricted to positive values
    for a in range(-m, m):
        for b in range(1, m):
            cresult = int_floor_div_pos_b(a, b)
            presult = a // b
            assert cresult == presult, (a, b, cresult, presult)

    # floor division, any nonzero divisor
    for a in range(-m, m):
        for b in range(-m, m):
            if b == 0:
                continue
            cresult = int_floor_div(a, b)
            presult = a // b
            assert cresult == presult, (a, b, cresult, presult)

    # modulo, divisor restricted to positive values
    for a in range(-m, m):
        for b in range(1, m):
            cresult = int_mod_pos_b(a, b)
            presult = a % b
            assert cresult == presult, (a, b, cresult, presult)

    # modulo, any nonzero divisor
    for a in range(-m, m):
        for b in range(-m, m):
            if b == 0:
                continue
            cresult = int_mod(a, b)
            presult = a % b
            assert cresult == presult, (a, b, cresult, presult)

    # print(int_mod(552, -918), 552 % -918)
    print(cmod(23, -11), 23 % -11)


if __name__ == "__main__":
    main()
" Vim highlighting for Floopy (Fortran+Loopy) source code
" -------------------------------------------------------
" Installation:
" Just drop this file into ~/.vim/syntax/floopy.vim
"
" Then do
" :set filetype=floopy
"
" You may also include a line
" vim: filetype=floopy.python
" at the end of your file to set the file type automatically.
"
" Another option is to include the following in your .vimrc
" au BufRead,BufNewFile *.floopy set filetype=floopy
runtime! syntax/fortran.vim
unlet b:current_syntax
syntax include @LoopyPython syntax/python.vim
if exists('s:current_syntax')
let b:current_syntax=s:current_syntax
else
unlet b:current_syntax
endif
syntax region textSnipLoopyPython
\ matchgroup=Comment
\ start='$loopy begin' end='$loopy end'
\ containedin=ALL
\ contains=@LoopyPython
import numpy as np
# Inspired by a visualization used in the Halide tutorial
# https://www.youtube.com/watch?v=3uiEyEKji0M
def div_ceil(nr, dr):
    """Ceiling integer division of *nr* by *dr*."""
    # Negate-twice trick: floor division of the negated numerator
    # yields the ceiling without any floating-point round trip.
    quotient = -(-nr // dr)
    return quotient
def product(iterable):
    """Return the product of all elements of *iterable* (1 if empty)."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result
class ArrayAccessPatternContext:
    """Context for visualizing array access patterns of a GPU-style
    kernel (inspired by a visualization from the Halide tutorial).

    Holds the global/local grid sizes, a logical timestamp advanced one
    step per animation frame, and the list of tracked arrays.
    """

    def __init__(self, gsize, lsize, subgroup_size=32, decay_constant=0.75):
        # gsize/lsize: sequences of global/local axis lengths
        self.lsize = lsize
        self.gsize = gsize
        self.subgroup_size = subgroup_size
        # logical time; advanced by tick() once per animation frame
        self.timestamp = 0
        # decay_constant: stored but not used in this class -- presumably
        # consumed by the arrays' plotting code; confirm against Array.
        self.decay_constant = decay_constant
        # total number of index axes: global axes first, then local ones
        self.ind_length = len(gsize) + len(lsize)
        # Array instances register themselves here (see Array.__init__)
        self.arrays = []

    def l(self, index):  # noqa: E743
        """Local-ID range along axis *index*, shaped for broadcasting:
        the single non-trivial axis sits at position len(gsize) + index.
        """
        subscript = [np.newaxis] * self.ind_length
        subscript[len(self.gsize) + index] = slice(None)
        return np.arange(self.lsize[index])[tuple(subscript)]

    def g(self, index):
        """Group-ID range along axis *index*, shaped for broadcasting:
        the single non-trivial axis sits at position *index*.
        """
        subscript = [np.newaxis] * self.ind_length
        subscript[index] = slice(None)
        return np.arange(self.gsize[index])[tuple(subscript)]

    def nsubgroups(self):
        """Number of subgroups needed to cover one work-group."""
        return div_ceil(product(self.lsize), self.subgroup_size)

    def animate(self, f, interval=200):
        """Build a matplotlib animation driven by generator function *f*.

        Each value yielded by f() advances the clock by one tick and
        refreshes every tracked array's plot.  Returns the
        FuncAnimation object.
        """
        import matplotlib.animation as animation
        import matplotlib.pyplot as plt

        fig = plt.figure()

        plots = []
        for iary, ary in enumerate(self.arrays):
            ax = fig.add_subplot(1, len(self.arrays), 1+iary)
            ax.set_title(ary.name)
            plots.append(ary.plot(ax))

        def data_gen():
            # One animation frame per value yielded by f()
            for _ in f():
                self.tick()
                for ary, plot in zip(self.arrays, plots):
                    plot.set_array(ary.get_plot_data())
                fig.canvas.draw()
                yield plots

        # must be kept alive until after plt.show()
        return animation.FuncAnimation(
                fig, lambda x: x, data_gen,
                blit=False, interval=interval, repeat=True)

    def tick(self):
        """Advance the logical clock by one step."""
        self.timestamp += 1
class Array:
    """Simulated array that records, per element, who accessed it last.

    Each element of :attr:`array` stores the attribute tuple
    ``(timestamp, subgroup, g0, g1, ...)`` of its most recent access,
    which :meth:`get_plot_data` renders as an RGB image.
    """

    def __init__(self, ctx, name, shape, strides, elements_per_row=None):
        """
        :arg ctx: the context (an ``ArrayAccessPatternContext``) whose
            timestamp and grid geometry this array uses
        :arg shape: tuple of array extents
        :arg strides: tuple of strides, one per axis of *shape*
        :arg elements_per_row: width of the plotted image; defaults to the
            extent of the smallest-stride axis, or 256 for 1D arrays
        """
        # Each array element stores a tuple:
        # (timestamp, subgroup, g0, g1, g2, ) of last access
        assert len(shape) == len(strides)

        self.nattributes = 2+len(ctx.gsize)

        if elements_per_row is None:
            if len(shape) > 1:
                # Use the extent of the fastest-moving (min-stride) axis
                # as the image width.
                minstride = min(strides)
                for sh_i, st_i in zip(shape, strides):
                    if st_i == minstride:
                        elements_per_row = sh_i
                        break
            else:
                elements_per_row = 256

        # total element count (row-major flattening of *shape*)
        self.array = np.zeros(
                (int(np.prod(shape)), self.nattributes,), dtype=np.int32)
        self.ctx = ctx
        self.name = name
        self.shape = shape
        self.strides = strides
        self.elements_per_row = elements_per_row

        ctx.arrays.append(self)

    def __getitem__(self, index):
        """Record an access at *index* (scalars and/or broadcastable index
        arrays as produced by ``ctx.l()``/``ctx.g()``); returns nothing."""
        if not isinstance(index, tuple):
            index = (index,)

        assert len(index) == len(self.shape)

        lin_index = sum(
                ind_i * stride_i
                for ind_i, stride_i in zip(index, self.strides))

        if not isinstance(lin_index, np.ndarray):
            # BUGFIX: index with a *tuple* of np.newaxis. Indexing with a
            # list of newaxis was deprecated in numpy 1.15 and is an error
            # since numpy 1.23, so the scalar-index path used to crash.
            lin_index = np.array(lin_index)[(np.newaxis,) * self.ctx.ind_length]

        self.array[lin_index, 0] = self.ctx.timestamp
        for i, _glength in enumerate(self.ctx.gsize):
            # Only record a group id along axes the access actually spans.
            if lin_index.shape[i] > 1:
                self.array[lin_index, 2+i] = self.ctx.g(i)

        # Linearize the local id (last local axis fastest) to find the
        # subgroup number of each accessing work-item.
        workitem_index = 0
        for i in range(len(self.ctx.lsize))[::-1]:
            workitem_index = (
                    workitem_index * self.ctx.lsize[i]
                    + self.ctx.l(i))
        subgroup = workitem_index//self.ctx.subgroup_size

        self.array[lin_index, 1] = subgroup

    def __setitem__(self, index, value):
        # A store is recorded exactly like a load; *value* is ignored.
        self.__getitem__(index)

    def get_plot_data(self):
        """Render the access log as a ``(rows, elements_per_row, 3)`` RGB
        image, exponentially dimmed by time since the last access."""
        nelements = self.array.shape[0]
        base_shape = (
                div_ceil(nelements, self.elements_per_row),
                self.elements_per_row,)
        shaped_array = np.zeros(
                (*base_shape, self.nattributes),
                dtype=np.float32)
        shaped_array.reshape(-1, self.nattributes)[:nelements] = self.array

        # Older accesses fade out exponentially.
        modulation = np.exp(
                -self.ctx.decay_constant*(self.ctx.timestamp-shaped_array[:, :, 0]))

        subgroup = shaped_array[:, :, 1]
        if self.ctx.nsubgroups() > 1:
            subgroup = subgroup/(self.ctx.nsubgroups()-1)
        else:
            subgroup.fill(1)

        rgb_array = np.zeros((*base_shape, 3))
        # NOTE(review): both guards below test ``len(gsize) > 1``; the first
        # (g.0 -> red) may have been meant to fire for 1D grids too, but
        # enabling it would divide by zero whenever gsize[0] == 1.
        # Left as-is pending clarification.
        if len(self.ctx.gsize) > 1:
            # g.0 -> red
            rgb_array[:, :, 0] = shaped_array[:, :, 2]/(self.ctx.gsize[0]-1)
        if len(self.ctx.gsize) > 1:
            # g.1 -> blue
            rgb_array[:, :, 2] = shaped_array[:, :, 3]/(self.ctx.gsize[1]-1)
        # subgroup -> green
        rgb_array[:, :, 1] = subgroup

        return rgb_array*modulation[:, :, np.newaxis]

    def plot(self, ax, **kwargs):
        """Draw the current plot data onto matplotlib axes *ax*."""
        return ax.imshow(
                self.get_plot_data(), interpolation="nearest",
                **kwargs)
def show_example():
    """Animate one of two loop orderings ("knl a" / "knl b") over a
    128x128 row-major array on a 16x16-blocked grid.

    Toggle the ``if`` below to pick the kernel. BUGFIX: previously both
    branches were disabled (``if 0: ... elif 0: ...``), which left ``f``
    undefined and made ``ctx.animate(f)`` raise a NameError; the second
    branch is now the ``else`` default so ``f`` always exists.
    """
    n = 2**7
    n16 = div_ceil(n, 16)
    ctx = ArrayAccessPatternContext(gsize=(n16, n16), lsize=(16, 16))
    in0 = Array(ctx, "in0", (n, n), (n, 1))

    if 0:
        # knl a
        i_inner = ctx.l(1)
        i_outer = ctx.g(1)
        k_inner = ctx.l(0)

        def f():
            for k_outer in range(n16):
                in0[i_inner + i_outer*16, k_inner + k_outer*16]
                yield
    else:
        # knl b (default)
        j_inner = ctx.l(0)
        j_outer = ctx.g(0)
        k_inner = ctx.l(1)

        def f():
            for k_outer in range(n16):
                in0[k_inner + k_outer*16, j_inner + j_outer*16]
                yield

    # keep a reference to the animation alive until after plt.show()
    ani = ctx.animate(f)
    import matplotlib.pyplot as plt
    if 1:
        plt.show()
    else:
        ani.save("access.mp4")
def show_example_2():
    """Animate a blocked (tile-by-tile) row-major traversal of a square
    array, one access per animation step."""
    block_edge = 8
    nblocks = 3
    edge = nblocks * block_edge

    ctx = ArrayAccessPatternContext(gsize=(1,), lsize=(1,),
            decay_constant=0.005)
    in0 = Array(ctx, "in0", (edge, edge), (edge, 1))

    def f():
        # Visit tiles left-to-right, top-to-bottom; within each tile,
        # walk its elements in row-major order.
        for i_outer in range(nblocks):
            for j_outer in range(nblocks):
                for i_inner in range(block_edge):
                    for j_inner in range(block_edge):
                        in0[i_inner + i_outer*block_edge,
                            j_inner + j_outer*block_edge]
                        yield

    # keep a reference to the animation alive until after plt.show()
    ani = ctx.animate(f, interval=10)
    import matplotlib.pyplot as plt
    show_interactively = True
    if show_interactively:
        plt.show()
    else:
        ani.save("access.mp4")
# Run the blocked-traversal demo when executed as a script.
if __name__ == "__main__":
    show_example_2()