From 679cbf3ac31a1bacd35fbdbd1e8d57e589711c41 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sun, 27 Jan 2019 21:56:46 +0100 Subject: [PATCH 01/32] Initial commit --- README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 000000000..846211dcb --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# kernel_profiler + -- GitLab From 55cd323e243a8f32d59bd0566683c44de63e56e6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 27 Jan 2019 16:29:01 -0600 Subject: [PATCH 02/32] added initial version of kernel profiler that just times kernels --- examples/example.py | 35 +++++++++ kernel_profiler/__init__.py | 139 ++++++++++++++++++++++++++++++++++++ kernel_profiler/version.py | 1 + requirements.txt | 4 ++ setup.cfg | 6 ++ setup.py | 28 ++++++++ 6 files changed, 213 insertions(+) create mode 100644 examples/example.py create mode 100644 kernel_profiler/__init__.py create mode 100644 kernel_profiler/version.py create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/examples/example.py b/examples/example.py new file mode 100644 index 000000000..ab9cb0353 --- /dev/null +++ b/examples/example.py @@ -0,0 +1,35 @@ +import loopy as lp +import numpy as np +from kernel_profiler import KernelProfiler +from kernel_profiler import KernelStatOptions as stat_opts + + +knl = lp.make_kernel( + "{[i,k,j]: 0<=i Date: Sun, 27 Jan 2019 20:31:48 -0600 Subject: [PATCH 03/32] added mem ops, math ops, sync ops, and grid sizes to stats options in profiler --- examples/example.py | 26 +++- kernel_profiler/__init__.py | 247 +++++++++++++++++++++++++++++++++--- 2 files changed, 251 insertions(+), 22 deletions(-) diff --git a/examples/example.py b/examples/example.py index ab9cb0353..a7eb14ef2 100644 --- a/examples/example.py +++ b/examples/example.py @@ -9,7 +9,10 @@ knl = lp.make_kernel( [ "c[i, j] = sum(k, a[i, k]*b[k, j])" ], - name="matmul", assumptions="n,m,ell >= 1") + name="matmul", + assumptions="n,m,ell >= 1", + lang_version=(2018, 2), + ) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) @@ -26,10 +29,23 @@ ell = 128 param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler("NVIDIA", "GEFORCE") -stats = kp.get_stats(knl, [stat_opts.WALL_TIME], param_dict=param_dict) -print(stats[stat_opts.WALL_TIME]) +stats = kp.profile( + knl, + [ + stat_opts.WALL_TIME, + stat_opts.MEMORY_ACCESS, + stat_opts.ARITHMETIC_OPS, + stat_opts.SYNCHRONIZATION, + stat_opts.GRID_SIZES, + ], + param_dict=param_dict) +print("\nWall time:", stats[stat_opts.WALL_TIME], "\n") +print(lp.stringify_stats_mapping(stats[stat_opts.MEMORY_ACCESS])) +print(lp.stringify_stats_mapping(stats[stat_opts.ARITHMETIC_OPS])) +print(lp.stringify_stats_mapping(stats[stat_opts.SYNCHRONIZATION])) +print(stats[stat_opts.GRID_SIZES], "\n") interactive_kp = KernelProfiler(interactive=True) -interactive_stats = interactive_kp.get_stats( +interactive_stats = interactive_kp.profile( knl, [stat_opts.WALL_TIME], param_dict=param_dict) -print(interactive_stats[stat_opts.WALL_TIME]) +print(interactive_stats[stat_opts.WALL_TIME], "\n") diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 919aef41f..3ec544842 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -38,25 +38,43 @@ def find_cl_device_candidates(platform_name, device_name): class KernelStatOptions: WALL_TIME = "wall_time" + MEMORY_ACCESS = "memory_access" + ARITHMETIC_OPS = "arithmetic_ops" + SYNCHRONIZATION = 
"synchronization" + GRID_SIZES = "grid_sizes" # TODO add other stat options here class KernelProfiler(object): - N_WARMUP_TIME_TRIALS = 4 - N_TIME_TRIALS = 64 - def __init__( self, platform_name=None, device_name=None, interactive=False, + n_warmup_time_trials=4, + n_time_trials=64, + evaluate_polys=True, + subgroup_size=32, + count_redundant_work=True, + count_madds=True, + count_within_subscripts=False, ): + self.ctx_cache = {} self.platform_name = platform_name self.device_name = device_name self.interactive = interactive + self.n_warmup_time_trials = n_warmup_time_trials + self.n_time_trials = n_time_trials + + self.evaluate_polys = evaluate_polys + self.subgroup_size = subgroup_size + self.count_redundant_work = count_redundant_work + self.count_madds = count_madds + self.count_within_subscripts = count_within_subscripts + def get_cl_context(self): if self.interactive: @@ -70,8 +88,8 @@ class KernelProfiler(object): try: return self.ctx_cache[cache_key] except KeyError: - ctx = cl.Context( - [find_cl_device_candidates(self.platform_name, self.device_name)[-1]] + ctx = cl.Context([find_cl_device_candidates( + self.platform_name, self.device_name)[-1]] ) self.ctx_cache[cache_key] = ctx return ctx @@ -84,9 +102,9 @@ class KernelProfiler(object): n_trials=None, ): - n_warmup_trials = self.N_WARMUP_TIME_TRIALS if n_warmup_trials is None \ + n_warmup_trials = self.n_warmup_time_trials if not n_warmup_trials \ else n_warmup_trials - n_trials = self.N_TIME_TRIALS if n_trials is None else n_trials + n_trials = self.n_time_trials if not n_trials else n_trials ctx = self.get_cl_context() queue = cl.CommandQueue(ctx) @@ -109,22 +127,154 @@ class KernelProfiler(object): import numpy as np return np.average(wtimes[n_warmup_trials:]) - def get_stats( - self, + def get_mem_access_stats( + self, + knl, + evaluate_polys=None, + param_dict=None, + count_redundant_work=None, + subgroup_size=None, + ): + + from loopy.statistics import get_mem_access_map + + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + count_redundant_work = self.count_redundant_work \ + if not count_redundant_work else count_redundant_work + subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size + + mem_access_map = get_mem_access_map( + knl, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size, + ) + + if evaluate_polys: + if param_dict is None: + raise ValueError("Cannont evaluate polynomials without param_dict.") + return mem_access_map.eval(param_dict) + else: + return mem_access_map + + def get_op_stats( + self, + knl, + evaluate_polys=None, + param_dict=None, + count_redundant_work=None, + subgroup_size=None, + count_madds=None, + count_within_subscripts=None, + ): + + from loopy.statistics import get_op_map + + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + count_redundant_work = self.count_redundant_work \ + if not count_redundant_work else count_redundant_work + subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size + count_madds = self.count_madds if not count_madds else count_madds + count_within_subscripts = self.count_within_subscripts \ + if not count_within_subscripts else count_within_subscripts + + op_map = get_op_map( knl, - stat_options=[], - param_dict=None, - n_warmup_wtime_trials=None, - n_wtime_trials=None, - ): + count_redundant_work=count_redundant_work, + 
count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size, + count_madds=count_madds, + ) + + if evaluate_polys: + if param_dict is None: + raise ValueError("Cannont evaluate polynomials without param_dict.") + return op_map.eval(param_dict) + else: + return op_map + + def get_synchronization_stats( + self, + knl, + evaluate_polys=None, + param_dict=None, + subgroup_size=None, + ): + + from loopy.statistics import get_synchronization_map + + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size + + sync_map = get_synchronization_map( + knl, + subgroup_size=subgroup_size, + ) + + if evaluate_polys: + if param_dict is None: + raise ValueError("Cannont evaluate polynomials without param_dict.") + return sync_map.eval(param_dict) + else: + return sync_map + + def get_grid_sizes( + self, + knl, + evaluate_polys=None, + param_dict=None, + ): + + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + + global_size, local_size = knl.get_grid_size_upper_bounds() + + from islpy import PwQPolynomial + gsize_pwqs = [] + lsize_pwqs = [] + for gsize in global_size: + gsize_pwqs.append(PwQPolynomial.from_pw_aff(gsize)) + for lsize in local_size: + lsize_pwqs.append(PwQPolynomial.from_pw_aff(lsize)) + + if evaluate_polys: + if param_dict is None: + raise ValueError("Cannont evaluate polynomials without param_dict.") + return [g.eval_with_dict(param_dict) for g in gsize_pwqs], \ + [l.eval_with_dict(param_dict) for l in lsize_pwqs] + else: + return gsize_pwqs, lsize_pwqs + + def profile( + self, + knl, + stat_options=[], + param_dict=None, + n_warmup_wtime_trials=None, + n_wtime_trials=None, + evaluate_polys=True, + count_redundant_work=None, + subgroup_size=None, + count_madds=True, + count_within_subscripts=False, + ): stats_found = {} if KernelStatOptions.WALL_TIME in stat_options: - n_warmup_wtime_trials = self.N_WARMUP_TIME_TRIALS \ + # if no value passed, set to defaults + #TODO these checks are redundant + n_warmup_wtime_trials = self.n_warmup_time_trials \ if n_warmup_wtime_trials is None else n_warmup_wtime_trials - n_wtime_trials = self.N_TIME_TRIALS \ + n_wtime_trials = self.n_time_trials \ if n_wtime_trials is None else n_wtime_trials if param_dict is None: @@ -134,6 +284,69 @@ class KernelProfiler(object): stats_found[KernelStatOptions.WALL_TIME] = self.time_kernel( knl, param_dict, n_warmup_wtime_trials, n_wtime_trials) - # TODO add other stat options here + if KernelStatOptions.MEMORY_ACCESS in stat_options: + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + count_redundant_work = self.count_redundant_work \ + if not count_redundant_work else count_redundant_work + subgroup_size = self.subgroup_size \ + if not subgroup_size else subgroup_size + + stats_found[KernelStatOptions.MEMORY_ACCESS] = self.get_mem_access_stats( + knl, + evaluate_polys=evaluate_polys, + param_dict=param_dict, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size, + ) + + if KernelStatOptions.ARITHMETIC_OPS in stat_options: + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + count_redundant_work = self.count_redundant_work \ + if not count_redundant_work else count_redundant_work + subgroup_size = self.subgroup_size \ 
+ if not subgroup_size else subgroup_size + count_madds = self.count_madds if not count_madds else count_madds + count_within_subscripts = self.count_within_subscripts \ + if not count_within_subscripts else count_within_subscripts + + stats_found[KernelStatOptions.ARITHMETIC_OPS] = self.get_op_stats( + knl, + evaluate_polys=evaluate_polys, + param_dict=param_dict, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size, + count_madds=count_madds, + count_within_subscripts=count_within_subscripts, + ) + + if KernelStatOptions.SYNCHRONIZATION in stat_options: + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + subgroup_size = self.subgroup_size \ + if not subgroup_size else subgroup_size + + stats_found[KernelStatOptions.SYNCHRONIZATION] = \ + self.get_synchronization_stats( + knl, + evaluate_polys=evaluate_polys, + param_dict=param_dict, + subgroup_size=subgroup_size, + ) + + if KernelStatOptions.GRID_SIZES in stat_options: + # if no value passed, set to defaults + evaluate_polys = self.evaluate_polys \ + if not evaluate_polys else evaluate_polys + + stats_found[KernelStatOptions.GRID_SIZES] = self.get_grid_sizes( + knl, + evaluate_polys=evaluate_polys, + param_dict=param_dict, + ) return stats_found -- GitLab From c1b5b03182a4cdbab3897c89ab25e34c8de8cd22 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 27 Jan 2019 20:58:10 -0600 Subject: [PATCH 04/32] removed redundant parameter checks, instead using instance variables and updating instance vars when requested --- kernel_profiler/__init__.py | 149 +++++++++--------------------------- 1 file changed, 38 insertions(+), 111 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 3ec544842..0d3048119 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -98,13 +98,11 @@ class KernelProfiler(object): self, knl, param_dict, - n_warmup_trials=None, - n_trials=None, ): - n_warmup_trials = self.n_warmup_time_trials if not n_warmup_trials \ - else n_warmup_trials - n_trials = self.n_time_trials if not n_trials else n_trials + if param_dict is None: + raise ValueError( + "Wall time requires dictionary of kernel parameters.") ctx = self.get_cl_context() queue = cl.CommandQueue(ctx) @@ -116,7 +114,7 @@ class KernelProfiler(object): wtimes = [] import time - for t in range(n_trials + n_warmup_trials): + for t in range(self.n_time_trials + self.n_warmup_time_trials): queue.finish() tstart = time.time() evt, out = compiled(queue, **arg_arrays) @@ -125,33 +123,23 @@ class KernelProfiler(object): wtimes.append(tend-tstart) import numpy as np - return np.average(wtimes[n_warmup_trials:]) + return np.average(wtimes[self.n_warmup_time_trials:]) def get_mem_access_stats( self, knl, - evaluate_polys=None, param_dict=None, - count_redundant_work=None, - subgroup_size=None, ): from loopy.statistics import get_mem_access_map - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - count_redundant_work = self.count_redundant_work \ - if not count_redundant_work else count_redundant_work - subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size - mem_access_map = get_mem_access_map( knl, - count_redundant_work=count_redundant_work, - subgroup_size=subgroup_size, + count_redundant_work=self.count_redundant_work, + subgroup_size=self.subgroup_size, ) - if evaluate_polys: + if self.evaluate_polys: if param_dict is None: raise 
ValueError("Cannont evaluate polynomials without param_dict.") return mem_access_map.eval(param_dict) @@ -161,35 +149,20 @@ class KernelProfiler(object): def get_op_stats( self, knl, - evaluate_polys=None, param_dict=None, - count_redundant_work=None, - subgroup_size=None, - count_madds=None, - count_within_subscripts=None, ): from loopy.statistics import get_op_map - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - count_redundant_work = self.count_redundant_work \ - if not count_redundant_work else count_redundant_work - subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size - count_madds = self.count_madds if not count_madds else count_madds - count_within_subscripts = self.count_within_subscripts \ - if not count_within_subscripts else count_within_subscripts - op_map = get_op_map( knl, - count_redundant_work=count_redundant_work, - count_within_subscripts=count_within_subscripts, - subgroup_size=subgroup_size, - count_madds=count_madds, + count_redundant_work=self.count_redundant_work, + count_within_subscripts=self.count_within_subscripts, + subgroup_size=self.subgroup_size, + count_madds=self.count_madds, ) - if evaluate_polys: + if self.evaluate_polys: if param_dict is None: raise ValueError("Cannont evaluate polynomials without param_dict.") return op_map.eval(param_dict) @@ -199,24 +172,17 @@ class KernelProfiler(object): def get_synchronization_stats( self, knl, - evaluate_polys=None, param_dict=None, - subgroup_size=None, ): from loopy.statistics import get_synchronization_map - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - subgroup_size = self.subgroup_size if not subgroup_size else subgroup_size - sync_map = get_synchronization_map( knl, - subgroup_size=subgroup_size, + subgroup_size=self.subgroup_size, ) - if evaluate_polys: + if self.evaluate_polys: if param_dict is None: raise ValueError("Cannont evaluate polynomials without param_dict.") return sync_map.eval(param_dict) @@ -226,14 +192,9 @@ class KernelProfiler(object): def get_grid_sizes( self, knl, - evaluate_polys=None, param_dict=None, ): - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - global_size, local_size = knl.get_grid_size_upper_bounds() from islpy import PwQPolynomial @@ -244,7 +205,7 @@ class KernelProfiler(object): for lsize in local_size: lsize_pwqs.append(PwQPolynomial.from_pw_aff(lsize)) - if evaluate_polys: + if self.evaluate_polys: if param_dict is None: raise ValueError("Cannont evaluate polynomials without param_dict.") return [g.eval_with_dict(param_dict) for g in gsize_pwqs], \ @@ -259,93 +220,59 @@ class KernelProfiler(object): param_dict=None, n_warmup_wtime_trials=None, n_wtime_trials=None, - evaluate_polys=True, + evaluate_polys=None, count_redundant_work=None, subgroup_size=None, - count_madds=True, - count_within_subscripts=False, + count_madds=None, + count_within_subscripts=None, ): + # update instance vars if requested + if n_warmup_wtime_trials is not None: + self.n_warmup_wtime_trials = n_warmup_wtime_trials + if n_wtime_trials is not None: + self.n_wtime_trials = n_wtime_trials + if evaluate_polys is not None: + self.evaluate_polys = evaluate_polys + if count_redundant_work is not None: + self.count_redundant_work = count_redundant_work + if subgroup_size is not None: + self.subgroup_size = subgroup_size + if count_madds is not None: + 
self.count_madds = count_madds + if count_within_subscripts is not None: + self.count_within_subscripts = count_within_subscripts + stats_found = {} if KernelStatOptions.WALL_TIME in stat_options: - - # if no value passed, set to defaults - #TODO these checks are redundant - n_warmup_wtime_trials = self.n_warmup_time_trials \ - if n_warmup_wtime_trials is None else n_warmup_wtime_trials - n_wtime_trials = self.n_time_trials \ - if n_wtime_trials is None else n_wtime_trials - - if param_dict is None: - raise ValueError( - "Wall time requires dictionary of kernel parameters.") - stats_found[KernelStatOptions.WALL_TIME] = self.time_kernel( - knl, param_dict, n_warmup_wtime_trials, n_wtime_trials) + knl, + param_dict, + ) if KernelStatOptions.MEMORY_ACCESS in stat_options: - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - count_redundant_work = self.count_redundant_work \ - if not count_redundant_work else count_redundant_work - subgroup_size = self.subgroup_size \ - if not subgroup_size else subgroup_size - stats_found[KernelStatOptions.MEMORY_ACCESS] = self.get_mem_access_stats( knl, - evaluate_polys=evaluate_polys, param_dict=param_dict, - count_redundant_work=count_redundant_work, - subgroup_size=subgroup_size, ) if KernelStatOptions.ARITHMETIC_OPS in stat_options: - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - count_redundant_work = self.count_redundant_work \ - if not count_redundant_work else count_redundant_work - subgroup_size = self.subgroup_size \ - if not subgroup_size else subgroup_size - count_madds = self.count_madds if not count_madds else count_madds - count_within_subscripts = self.count_within_subscripts \ - if not count_within_subscripts else count_within_subscripts - stats_found[KernelStatOptions.ARITHMETIC_OPS] = self.get_op_stats( knl, - evaluate_polys=evaluate_polys, param_dict=param_dict, - count_redundant_work=count_redundant_work, - subgroup_size=subgroup_size, - count_madds=count_madds, - count_within_subscripts=count_within_subscripts, ) if KernelStatOptions.SYNCHRONIZATION in stat_options: - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - subgroup_size = self.subgroup_size \ - if not subgroup_size else subgroup_size - stats_found[KernelStatOptions.SYNCHRONIZATION] = \ self.get_synchronization_stats( knl, - evaluate_polys=evaluate_polys, param_dict=param_dict, - subgroup_size=subgroup_size, ) if KernelStatOptions.GRID_SIZES in stat_options: - # if no value passed, set to defaults - evaluate_polys = self.evaluate_polys \ - if not evaluate_polys else evaluate_polys - stats_found[KernelStatOptions.GRID_SIZES] = self.get_grid_sizes( knl, - evaluate_polys=evaluate_polys, param_dict=param_dict, ) -- GitLab From 8e8a980ff3c655bf5b03a862b63a1a6aa4e45a59 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 27 Jan 2019 23:16:30 -0600 Subject: [PATCH 05/32] added flop rate and bandwidth to profiler stat options --- examples/example.py | 40 ++++++++++++++++------------ kernel_profiler/__init__.py | 52 +++++++++++++++++++++++++++---------- requirements.txt | 1 + 3 files changed, 62 insertions(+), 31 deletions(-) diff --git a/examples/example.py b/examples/example.py index a7eb14ef2..3a18c0aa2 100644 --- a/examples/example.py +++ b/examples/example.py @@ -1,7 +1,7 @@ import loopy as lp import numpy as np from kernel_profiler import KernelProfiler 
-from kernel_profiler import KernelStatOptions as stat_opts +from kernel_profiler import KernelStatOptions as kso knl = lp.make_kernel( @@ -23,29 +23,35 @@ knl = lp.split_iname(knl, "k", lsize) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") -n = 512 -m = 256 -ell = 128 +n = 2**10 +m = 2**11 +ell = 2**12 param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler("NVIDIA", "GEFORCE") stats = kp.profile( knl, [ - stat_opts.WALL_TIME, - stat_opts.MEMORY_ACCESS, - stat_opts.ARITHMETIC_OPS, - stat_opts.SYNCHRONIZATION, - stat_opts.GRID_SIZES, + kso.WALL_TIME, + kso.MEM_ACCESS_MAP, + kso.OP_MAP, + kso.SYNC_MAP, + kso.GRID_SIZES, + kso.FLOP_RATE, + kso.MEM_BANDWIDTH, ], - param_dict=param_dict) -print("\nWall time:", stats[stat_opts.WALL_TIME], "\n") -print(lp.stringify_stats_mapping(stats[stat_opts.MEMORY_ACCESS])) -print(lp.stringify_stats_mapping(stats[stat_opts.ARITHMETIC_OPS])) -print(lp.stringify_stats_mapping(stats[stat_opts.SYNCHRONIZATION])) -print(stats[stat_opts.GRID_SIZES], "\n") + param_dict=param_dict, + evaluate_polys=False, + ) +print("\nWall time:", stats[kso.WALL_TIME], "\n") +print(lp.stringify_stats_mapping(stats[kso.MEM_ACCESS_MAP])) +print(lp.stringify_stats_mapping(stats[kso.OP_MAP])) +print(lp.stringify_stats_mapping(stats[kso.SYNC_MAP])) +print(stats[kso.GRID_SIZES], "\n") +print(stats[kso.FLOP_RATE], "\n") +print(stats[kso.MEM_BANDWIDTH], "\n") interactive_kp = KernelProfiler(interactive=True) interactive_stats = interactive_kp.profile( - knl, [stat_opts.WALL_TIME], param_dict=param_dict) -print(interactive_stats[stat_opts.WALL_TIME], "\n") + knl, [kso.WALL_TIME], param_dict=param_dict) +print(interactive_stats[kso.WALL_TIME], "\n") diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 0d3048119..06ee71a3d 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -38,11 +38,12 @@ def find_cl_device_candidates(platform_name, device_name): class KernelStatOptions: WALL_TIME = "wall_time" - MEMORY_ACCESS = "memory_access" - ARITHMETIC_OPS = "arithmetic_ops" - SYNCHRONIZATION = "synchronization" + MEM_ACCESS_MAP = "mem_access_map" + OP_MAP = "op_map" + SYNC_MAP = "sync_map" GRID_SIZES = "grid_sizes" - # TODO add other stat options here + FLOP_RATE = "flop_rate" + MEM_BANDWIDTH = "mem_bandwidth" class KernelProfiler(object): @@ -244,36 +245,59 @@ class KernelProfiler(object): self.count_within_subscripts = count_within_subscripts stats_found = {} + kso = KernelStatOptions - if KernelStatOptions.WALL_TIME in stat_options: - stats_found[KernelStatOptions.WALL_TIME] = self.time_kernel( + if kso.WALL_TIME in stat_options or \ + kso.FLOP_RATE in stat_options or \ + kso.MEM_BANDWIDTH in stat_options: + stats_found[kso.WALL_TIME] = self.time_kernel( knl, param_dict, ) - if KernelStatOptions.MEMORY_ACCESS in stat_options: - stats_found[KernelStatOptions.MEMORY_ACCESS] = self.get_mem_access_stats( + if kso.MEM_ACCESS_MAP in stat_options or \ + kso.MEM_BANDWIDTH in stat_options: + stats_found[kso.MEM_ACCESS_MAP] = self.get_mem_access_stats( knl, param_dict=param_dict, ) - if KernelStatOptions.ARITHMETIC_OPS in stat_options: - stats_found[KernelStatOptions.ARITHMETIC_OPS] = self.get_op_stats( + if kso.OP_MAP in stat_options or \ + kso.FLOP_RATE in stat_options: + stats_found[kso.OP_MAP] = self.get_op_stats( knl, param_dict=param_dict, ) - if KernelStatOptions.SYNCHRONIZATION in stat_options: - 
stats_found[KernelStatOptions.SYNCHRONIZATION] = \ + if kso.SYNC_MAP in stat_options: + stats_found[kso.SYNC_MAP] = \ self.get_synchronization_stats( knl, param_dict=param_dict, ) - if KernelStatOptions.GRID_SIZES in stat_options: - stats_found[KernelStatOptions.GRID_SIZES] = self.get_grid_sizes( + if kso.GRID_SIZES in stat_options: + stats_found[kso.GRID_SIZES] = self.get_grid_sizes( knl, param_dict=param_dict, ) + if kso.FLOP_RATE in stat_options: + import numpy as np + float_ops = stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64] + ).sum() + if not self.evaluate_polys: + float_ops = float_ops.eval_with_dict(param_dict) + stats_found[kso.FLOP_RATE] = float_ops/stats_found[kso.WALL_TIME] + + if kso.MEM_BANDWIDTH in stat_options: + data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( + mtype=["global"] + ).to_bytes().sum() + if not self.evaluate_polys: + data_moved_bytes = data_moved_bytes.eval_with_dict(param_dict) + stats_found[kso.MEM_BANDWIDTH] = \ + data_moved_bytes/stats_found[kso.WALL_TIME] + return stats_found diff --git a/requirements.txt b/requirements.txt index 5352cf661..8482d2c84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ numpy +git+https://github.com/inducer/islpy.git git+https://github.com/inducer/loopy.git git+https://github.com/inducer/pyopencl.git git+https://github.com/inducer/pytools.git -- GitLab From b55c066ff5ccd37fd13427b1d28ce6736bc6be00 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 28 Jan 2019 11:38:49 -0600 Subject: [PATCH 06/32] minor change to example --- examples/example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/example.py b/examples/example.py index 3a18c0aa2..ddc44bc2a 100644 --- a/examples/example.py +++ b/examples/example.py @@ -25,7 +25,7 @@ knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") n = 2**10 m = 2**11 -ell = 2**12 +ell = 2**9 param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler("NVIDIA", "GEFORCE") @@ -41,7 +41,7 @@ stats = kp.profile( kso.MEM_BANDWIDTH, ], param_dict=param_dict, - evaluate_polys=False, + evaluate_polys=True, ) print("\nWall time:", stats[kso.WALL_TIME], "\n") print(lp.stringify_stats_mapping(stats[kso.MEM_ACCESS_MAP])) -- GitLab From f69fab3d8f0bb43897a37d6495be984cab48c835 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 28 Jan 2019 11:46:01 -0600 Subject: [PATCH 07/32] install instructions in readme --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 846211dcb..addc1c34c 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,9 @@ # kernel_profiler +Install: + +`python setup.py install` + +Developer install (source changes take immediate effect): + +`python setup.py develop` -- GitLab From 16cef0d159e829be19be4c8de0bcfe50470a6d9c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 28 Jan 2019 12:04:58 -0600 Subject: [PATCH 08/32] explained stats options in readme --- README.md | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index addc1c34c..5f83baabb 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,32 @@ # kernel_profiler -Install: +**Install**: `python setup.py install` -Developer install (source changes take immediate effect): +**Developer install** (source changes take immediate effect): `python setup.py develop` + +# Stat options + +* **KernelStatOptions.WALL_TIME** + Kernel execution time using random input data. 
+ +* **KernelStatOptions.MEM_ACCESS_MAP** + A [**loopy.ToCountMap**](https://documen.tician.de/loopy/ref_other.html#loopy.ToCountMap) mapping memory accesses to counts. Also see [**loopy.get_mem_access_map**](https://documen.tician.de/loopy/ref_other.html#loopy.get_mem_access_map). + +* **KernelStatOptions.OP_MAP** + A [**loopy.ToCountMap**](https://documen.tician.de/loopy/ref_other.html#loopy.ToCountMap) mapping operations to counts. Also see [**loopy.get_op_map**](https://documen.tician.de/loopy/ref_other.html#loopy.get_op_map). + +* **KernelStatOptions.SYNC_MAP** + A [**loopy.ToCountMap**](https://documen.tician.de/loopy/ref_other.html#loopy.ToCountMap) mapping synchronization operations to counts. Also see [**loopy.get_synchronization_map**](https://documen.tician.de/loopy/ref_other.html#loopy.get_synchronization_map). + +* **KernelStatOptions.GRID_SIZES** + A tuple containing (local sizes, global sizes). + +* **KernelStatOptions.FLOP_RATE** + Number of 32-bit and 64-bit floating point operations per second. + +* **KernelStatOptions.MEM_BANDWIDTH** + Global memory bytes accessed per second. -- GitLab From 8a680829aaa2f91b2f42d20c42a9fb8eee712acd Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 30 Jan 2019 16:15:55 -0600 Subject: [PATCH 09/32] added save_ptx option --- examples/example.py | 3 ++- kernel_profiler/__init__.py | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/examples/example.py b/examples/example.py index ddc44bc2a..0231089f9 100644 --- a/examples/example.py +++ b/examples/example.py @@ -28,7 +28,7 @@ m = 2**11 ell = 2**9 param_dict = {'n': n, 'm': m, 'ell': ell} -kp = KernelProfiler("NVIDIA", "GEFORCE") +kp = KernelProfiler("NVIDIA", "GEFORCE", include_kernel_params_in_ptx_filename=True) stats = kp.profile( knl, [ @@ -39,6 +39,7 @@ stats = kp.profile( kso.GRID_SIZES, kso.FLOP_RATE, kso.MEM_BANDWIDTH, + kso.SAVE_PTX, ], param_dict=param_dict, evaluate_polys=True, diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 06ee71a3d..67a8a40e7 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -36,6 +36,17 @@ def find_cl_device_candidates(platform_name, device_name): return candidates +def write_ptx(ctx, knl, filename=None): + cl_program = cl.Program( + ctx, lp.generate_code_v2(knl).device_code() + ).build(options=knl.options.cl_build_options) + ptx_src = cl_program.binaries[0] + if not filename: + filename = "ptx_"+knl.name+".ptx" + ptx_src_file = open(filename, 'w') + ptx_src_file.write(ptx_src.decode('utf-8', 'ignore')) + + class KernelStatOptions: WALL_TIME = "wall_time" MEM_ACCESS_MAP = "mem_access_map" @@ -44,6 +55,7 @@ class KernelStatOptions: GRID_SIZES = "grid_sizes" FLOP_RATE = "flop_rate" MEM_BANDWIDTH = "mem_bandwidth" + SAVE_PTX = "save_ptx" class KernelProfiler(object): @@ -60,6 +72,7 @@ class KernelProfiler(object): count_redundant_work=True, count_madds=True, count_within_subscripts=False, + include_kernel_params_in_ptx_filename=False, ): self.ctx_cache = {} @@ -76,6 +89,9 @@ class KernelProfiler(object): self.count_madds = count_madds self.count_within_subscripts = count_within_subscripts + self.include_kernel_params_in_ptx_filename = \ + include_kernel_params_in_ptx_filename + def get_cl_context(self): if self.interactive: @@ -126,6 +142,23 @@ class KernelProfiler(object): import numpy as np return np.average(wtimes[self.n_warmup_time_trials:]) + def save_ptx( + self, + knl, + param_dict=None, + ): + + if self.include_kernel_params_in_ptx_filename: + 
write_ptx( + self.get_cl_context(), + knl, + filename="ptx_"+knl.name+"_"+"_".join( + ["%s%d" % (p, v) for p, v in param_dict.items()] + )+".ptx" + ) + else: + write_ptx(self.get_cl_context(), knl) + def get_mem_access_stats( self, knl, @@ -226,9 +259,12 @@ class KernelProfiler(object): subgroup_size=None, count_madds=None, count_within_subscripts=None, + include_kernel_params_in_ptx_filename=None, ): # update instance vars if requested + # TODO don't change instance variables, don't allow options changes here, + # instead, make a change_profile_options function if n_warmup_wtime_trials is not None: self.n_warmup_wtime_trials = n_warmup_wtime_trials if n_wtime_trials is not None: @@ -243,6 +279,9 @@ class KernelProfiler(object): self.count_madds = count_madds if count_within_subscripts is not None: self.count_within_subscripts = count_within_subscripts + if include_kernel_params_in_ptx_filename is not None: + self.include_kernel_params_in_ptx_filename = \ + include_kernel_params_in_ptx_filename stats_found = {} kso = KernelStatOptions @@ -300,4 +339,7 @@ class KernelProfiler(object): stats_found[kso.MEM_BANDWIDTH] = \ data_moved_bytes/stats_found[kso.WALL_TIME] + if kso.SAVE_PTX in stat_options: + self.save_ptx(knl, param_dict) + return stats_found -- GitLab From b07cd5b03d6c00a88734642ba9047b20eb1782be Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 30 Jan 2019 16:25:27 -0600 Subject: [PATCH 10/32] created update_options function to change instance variables in profiler, rather than changing them when profiling --- examples/example.py | 11 +++--- kernel_profiler/__init__.py | 70 ++++++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/examples/example.py b/examples/example.py index 0231089f9..c3a6cfff8 100644 --- a/examples/example.py +++ b/examples/example.py @@ -28,7 +28,10 @@ m = 2**11 ell = 2**9 param_dict = {'n': n, 'm': m, 'ell': ell} -kp = KernelProfiler("NVIDIA", "GEFORCE", include_kernel_params_in_ptx_filename=True) +kp = KernelProfiler("NVIDIA", "GEFORCE", + evaluate_polys = True, + include_kernel_params_in_ptx_filename=True, + ) stats = kp.profile( knl, [ @@ -42,7 +45,6 @@ stats = kp.profile( kso.SAVE_PTX, ], param_dict=param_dict, - evaluate_polys=True, ) print("\nWall time:", stats[kso.WALL_TIME], "\n") print(lp.stringify_stats_mapping(stats[kso.MEM_ACCESS_MAP])) @@ -52,7 +54,8 @@ print(stats[kso.GRID_SIZES], "\n") print(stats[kso.FLOP_RATE], "\n") print(stats[kso.MEM_BANDWIDTH], "\n") -interactive_kp = KernelProfiler(interactive=True) -interactive_stats = interactive_kp.profile( +kp.update_options(interactive=True) + +interactive_stats = kp.profile( knl, [kso.WALL_TIME], param_dict=param_dict) print(interactive_stats[kso.WALL_TIME], "\n") diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 67a8a40e7..b0abf3bfd 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -92,6 +92,44 @@ class KernelProfiler(object): self.include_kernel_params_in_ptx_filename = \ include_kernel_params_in_ptx_filename + def update_options( + self, + platform_name=None, + device_name=None, + interactive=None, + n_warmup_wtime_trials=None, + n_wtime_trials=None, + evaluate_polys=None, + count_redundant_work=None, + subgroup_size=None, + count_madds=None, + count_within_subscripts=None, + include_kernel_params_in_ptx_filename=None, + ): + if platform_name is not None: + self.platform_name = platform_name + if device_name is not None: + self.device_name = device_name + if interactive is not None: + 
self.interactive = interactive + if n_warmup_wtime_trials is not None: + self.n_warmup_wtime_trials = n_warmup_wtime_trials + if n_wtime_trials is not None: + self.n_wtime_trials = n_wtime_trials + if evaluate_polys is not None: + self.evaluate_polys = evaluate_polys + if count_redundant_work is not None: + self.count_redundant_work = count_redundant_work + if subgroup_size is not None: + self.subgroup_size = subgroup_size + if count_madds is not None: + self.count_madds = count_madds + if count_within_subscripts is not None: + self.count_within_subscripts = count_within_subscripts + if include_kernel_params_in_ptx_filename is not None: + self.include_kernel_params_in_ptx_filename = \ + include_kernel_params_in_ptx_filename + def get_cl_context(self): if self.interactive: @@ -149,6 +187,9 @@ class KernelProfiler(object): ): if self.include_kernel_params_in_ptx_filename: + if param_dict is None: + raise ValueError("Cannot include kernel params " + "in ptx filename, no param dict passed.") write_ptx( self.get_cl_context(), knl, @@ -252,37 +293,8 @@ class KernelProfiler(object): knl, stat_options=[], param_dict=None, - n_warmup_wtime_trials=None, - n_wtime_trials=None, - evaluate_polys=None, - count_redundant_work=None, - subgroup_size=None, - count_madds=None, - count_within_subscripts=None, - include_kernel_params_in_ptx_filename=None, ): - # update instance vars if requested - # TODO don't change instance variables, don't allow options changes here, - # instead, make a change_profile_options function - if n_warmup_wtime_trials is not None: - self.n_warmup_wtime_trials = n_warmup_wtime_trials - if n_wtime_trials is not None: - self.n_wtime_trials = n_wtime_trials - if evaluate_polys is not None: - self.evaluate_polys = evaluate_polys - if count_redundant_work is not None: - self.count_redundant_work = count_redundant_work - if subgroup_size is not None: - self.subgroup_size = subgroup_size - if count_madds is not None: - self.count_madds = count_madds - if count_within_subscripts is not None: - self.count_within_subscripts = count_within_subscripts - if include_kernel_params_in_ptx_filename is not None: - self.include_kernel_params_in_ptx_filename = \ - include_kernel_params_in_ptx_filename - stats_found = {} kso = KernelStatOptions -- GitLab From ad495b80ac860b6fca804ea01b771500be38e12d Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 30 Jan 2019 22:22:12 -0600 Subject: [PATCH 11/32] added generated_code as a stat option --- examples/example.py | 7 +++++-- kernel_profiler/__init__.py | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/example.py b/examples/example.py index c3a6cfff8..28ebf2c29 100644 --- a/examples/example.py +++ b/examples/example.py @@ -28,7 +28,9 @@ m = 2**11 ell = 2**9 param_dict = {'n': n, 'm': m, 'ell': ell} -kp = KernelProfiler("NVIDIA", "GEFORCE", +kp = KernelProfiler( + #"NVIDIA", "GEFORCE", + interactive=True, evaluate_polys = True, include_kernel_params_in_ptx_filename=True, ) @@ -43,6 +45,7 @@ stats = kp.profile( kso.FLOP_RATE, kso.MEM_BANDWIDTH, kso.SAVE_PTX, + kso.GENERATED_CODE, ], param_dict=param_dict, ) @@ -54,7 +57,7 @@ print(stats[kso.GRID_SIZES], "\n") print(stats[kso.FLOP_RATE], "\n") print(stats[kso.MEM_BANDWIDTH], "\n") -kp.update_options(interactive=True) +kp.update_options(evaluate_polys=False) interactive_stats = kp.profile( knl, [kso.WALL_TIME], param_dict=param_dict) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index b0abf3bfd..ec13b0914 100644 --- a/kernel_profiler/__init__.py +++ 
b/kernel_profiler/__init__.py @@ -56,6 +56,7 @@ class KernelStatOptions: FLOP_RATE = "flop_rate" MEM_BANDWIDTH = "mem_bandwidth" SAVE_PTX = "save_ptx" + GENERATED_CODE = "generated_code" class KernelProfiler(object): @@ -354,4 +355,7 @@ class KernelProfiler(object): if kso.SAVE_PTX in stat_options: self.save_ptx(knl, param_dict) + if kso.GENERATED_CODE in stat_options: + stats_found[kso.GENERATED_CODE] = lp.generate_code_v2(knl).device_code() + return stats_found -- GitLab From 10eb329a4c33c69bf77f397c3eb54109de508a9e Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 30 Jan 2019 22:40:46 -0600 Subject: [PATCH 12/32] removed interactive setting, instead automatically interactive when no platform/device provided --- examples/example.py | 6 +++--- kernel_profiler/__init__.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/example.py b/examples/example.py index 28ebf2c29..c9db9ebe5 100644 --- a/examples/example.py +++ b/examples/example.py @@ -30,7 +30,7 @@ param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler( #"NVIDIA", "GEFORCE", - interactive=True, + #"NVIDIA", "K40C", evaluate_polys = True, include_kernel_params_in_ptx_filename=True, ) @@ -59,6 +59,6 @@ print(stats[kso.MEM_BANDWIDTH], "\n") kp.update_options(evaluate_polys=False) -interactive_stats = kp.profile( +stats = kp.profile( knl, [kso.WALL_TIME], param_dict=param_dict) -print(interactive_stats[kso.WALL_TIME], "\n") +print(stats[kso.WALL_TIME], "\n") diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index ec13b0914..058966dd2 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -133,13 +133,13 @@ class KernelProfiler(object): def get_cl_context(self): - if self.interactive: - return cl.create_some_context() + if self.platform_name is None or self.device_name is None: + ctx = cl.create_some_context() + self.platform_name = ctx.devices[0].platform.name + self.device_name = ctx.devices[0].name + self.ctx_cache[(self.platform_name, self.device_name, "ctx")] = ctx + return ctx else: - if self.platform_name is None or self.device_name is None: - raise ValueError( - "Wall time requires platform name, and device name.") - cache_key = (self.platform_name, self.device_name, "ctx") try: return self.ctx_cache[cache_key] -- GitLab From 12e4c12855b407fbb8e2994beee3ec43b0219a28 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 30 Jan 2019 22:56:49 -0600 Subject: [PATCH 13/32] allowing ptx filename suffix --- kernel_profiler/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 058966dd2..4c0e103b3 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -74,7 +74,9 @@ class KernelProfiler(object): count_madds=True, count_within_subscripts=False, include_kernel_params_in_ptx_filename=False, + ptx_filename_suffix="", ): + # TODO figure out how to let user specify target w/device self.ctx_cache = {} self.platform_name = platform_name @@ -92,6 +94,7 @@ class KernelProfiler(object): self.include_kernel_params_in_ptx_filename = \ include_kernel_params_in_ptx_filename + self.ptx_filename_suffix = ptx_filename_suffix def update_options( self, @@ -106,6 +109,7 @@ class KernelProfiler(object): count_madds=None, count_within_subscripts=None, include_kernel_params_in_ptx_filename=None, + ptx_filename_suffix=None, ): if platform_name is not None: self.platform_name = platform_name @@ -130,6 +134,8 @@ class KernelProfiler(object): if 
include_kernel_params_in_ptx_filename is not None: self.include_kernel_params_in_ptx_filename = \ include_kernel_params_in_ptx_filename + if ptx_filename_suffix is not None: + self.ptx_filename_suffix = ptx_filename_suffix def get_cl_context(self): @@ -196,7 +202,7 @@ class KernelProfiler(object): knl, filename="ptx_"+knl.name+"_"+"_".join( ["%s%d" % (p, v) for p, v in param_dict.items()] - )+".ptx" + )+self.ptx_filename_suffix+".ptx" ) else: write_ptx(self.get_cl_context(), knl) -- GitLab From ac5516874cce9724319ec26bda2da9a6b4f9c77b Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 30 Jan 2019 23:22:47 -0600 Subject: [PATCH 14/32] now when kernel already has a target w/ device, use that to create ctx --- kernel_profiler/__init__.py | 41 +++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 4c0e103b3..ec113b3a6 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -137,15 +137,44 @@ class KernelProfiler(object): if ptx_filename_suffix is not None: self.ptx_filename_suffix = ptx_filename_suffix - def get_cl_context(self): + def get_cl_context(self, knl): + + if knl.target is not None and knl.target.device is not None: + # kernel has a device already, see if we can use it + knl_platform_name = knl.target.device.platform.name + knl_device_name = knl.target.device.name + + # check for mismatch between platforms/devices + if (self.platform_name is not None + and not self.platform_name in knl_platform_name) or ( + self.device_name is not None + and not self.device_name in knl_device_name): + raise ValueError("kernel target platform %s and/or device %s do " + "not match profiler platform %s and/or device %s." + % (knl_platform_name, knl_device_name, + self.platform_name, self.device_name)) + + cache_key = (knl_platform_name, knl_device_name, "ctx") + try: + return self.ctx_cache[cache_key] + except KeyError: + ctx = cl.Context([find_cl_device_candidates( + knl_platform_name, knl_device_name)[-1]] + ) + self.ctx_cache[cache_key] = ctx + return ctx - if self.platform_name is None or self.device_name is None: - ctx = cl.create_some_context() + elif self.platform_name is None or self.device_name is None: + # kernel does not have a pre-specified device, + # and profiler does not know platform+device + ctx = cl.create_some_context() # interactive mode self.platform_name = ctx.devices[0].platform.name self.device_name = ctx.devices[0].name self.ctx_cache[(self.platform_name, self.device_name, "ctx")] = ctx return ctx + else: + # profiler knows both platform and device already cache_key = (self.platform_name, self.device_name, "ctx") try: return self.ctx_cache[cache_key] @@ -166,7 +195,7 @@ class KernelProfiler(object): raise ValueError( "Wall time requires dictionary of kernel parameters.") - ctx = self.get_cl_context() + ctx = self.get_cl_context(knl) queue = cl.CommandQueue(ctx) arg_arrays = create_rand_args(ctx, knl, param_dict) @@ -198,14 +227,14 @@ class KernelProfiler(object): raise ValueError("Cannot include kernel params " "in ptx filename, no param dict passed.") write_ptx( - self.get_cl_context(), + self.get_cl_context(knl), knl, filename="ptx_"+knl.name+"_"+"_".join( ["%s%d" % (p, v) for p, v in param_dict.items()] )+self.ptx_filename_suffix+".ptx" ) else: - write_ptx(self.get_cl_context(), knl) + write_ptx(self.get_cl_context(knl), knl) def get_mem_access_stats( self, -- GitLab From a61c07d94b91aa2585dcdd6a052d627ca9c26eac Mon Sep 17 00:00:00 2001 From: 
James Stevens Date: Thu, 31 Jan 2019 01:40:46 -0600 Subject: [PATCH 15/32] counting madds as two ops for flop/s rate --- kernel_profiler/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index ec113b3a6..453fa9a92 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -371,8 +371,13 @@ class KernelProfiler(object): if kso.FLOP_RATE in stat_options: import numpy as np + # count madds as 2 ops + # (count all flops once and then count the madds again) float_ops = stats_found[kso.OP_MAP].filter_by( dtype=[np.float32, np.float64] + ).sum() + \ + stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64], name=["madd"] ).sum() if not self.evaluate_polys: float_ops = float_ops.eval_with_dict(param_dict) -- GitLab From 5b330274988b54ff3d4c5e1ad6982c45606f9219 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 31 Jan 2019 10:39:40 -0600 Subject: [PATCH 16/32] updated readme --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 5f83baabb..0c1e2149c 100644 --- a/README.md +++ b/README.md @@ -30,3 +30,9 @@ * **KernelStatOptions.MEM_BANDWIDTH** Global memory bytes accessed per second. + +* **KernelStatOptions.GENERATED_CODE** + Generated opencl code. + +* **KernelStatOptions.SAVE_PTX** + Save PTX (Portable Thread eXecution) file. -- GitLab From 339ecaaa28007d6f8190ed899d8d6bd882864bb3 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 31 Jan 2019 19:28:32 -0600 Subject: [PATCH 17/32] fixing flop counting, flops were only being counted once per subgroup --- kernel_profiler/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 453fa9a92..d2b2e00f8 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -373,17 +373,19 @@ class KernelProfiler(object): import numpy as np # count madds as 2 ops # (count all flops once and then count the madds again) - float_ops = stats_found[kso.OP_MAP].filter_by( - dtype=[np.float32, np.float64] - ).sum() + \ + float_ops = self.subgroup_size*( stats_found[kso.OP_MAP].filter_by( - dtype=[np.float32, np.float64], name=["madd"] - ).sum() + dtype=[np.float32, np.float64] + ).sum() + + stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64], name=["madd"] + ).sum()) if not self.evaluate_polys: float_ops = float_ops.eval_with_dict(param_dict) stats_found[kso.FLOP_RATE] = float_ops/stats_found[kso.WALL_TIME] if kso.MEM_BANDWIDTH in stat_options: + # TODO check for stride 0 access, only counted once per subgroup data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"] ).to_bytes().sum() -- GitLab From 642845920ba7bd7560b8ccd3be218c2301d68ce0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 31 Jan 2019 22:38:08 -0600 Subject: [PATCH 18/32] accounting for count granularity when computing flops and bandwidth --- kernel_profiler/__init__.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index d2b2e00f8..4cbefbed0 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -76,7 +76,6 @@ class KernelProfiler(object): include_kernel_params_in_ptx_filename=False, ptx_filename_suffix="", ): - # TODO figure out how to let user specify target w/device self.ctx_cache = {} self.platform_name = platform_name @@ -146,9 +145,9 @@ class KernelProfiler(object): 
# check for mismatch between platforms/devices if (self.platform_name is not None - and not self.platform_name in knl_platform_name) or ( + and self.platform_name not in knl_platform_name) or ( self.device_name is not None - and not self.device_name in knl_device_name): + and self.device_name not in knl_device_name): raise ValueError("kernel target platform %s and/or device %s do " "not match profiler platform %s and/or device %s." % (knl_platform_name, knl_device_name, @@ -373,21 +372,45 @@ class KernelProfiler(object): import numpy as np # count madds as 2 ops # (count all flops once and then count the madds again) + + # flops counted w/subgroup granularity float_ops = self.subgroup_size*( stats_found[kso.OP_MAP].filter_by( - dtype=[np.float32, np.float64] + dtype=[np.float32, np.float64], + count_granularity=[lp.CountGranularity.SUBGROUP], ).sum() + stats_found[kso.OP_MAP].filter_by( - dtype=[np.float32, np.float64], name=["madd"] + dtype=[np.float32, np.float64], + count_granularity=[lp.CountGranularity.SUBGROUP], + name=["madd"] ).sum()) + + # flops counted w/workitem granularity (should be zero) + float_ops += stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64], + count_granularity=[lp.CountGranularity.WORKITEM], + ).sum() + stats_found[kso.OP_MAP].filter_by( + dtype=[np.float32, np.float64], + count_granularity=[lp.CountGranularity.WORKITEM], + name=["madd"] + ).sum() + # TODO after ToCountMap gets version of sum that allows + # counting w/specified count granularity, update this + if not self.evaluate_polys: float_ops = float_ops.eval_with_dict(param_dict) stats_found[kso.FLOP_RATE] = float_ops/stats_found[kso.WALL_TIME] if kso.MEM_BANDWIDTH in stat_options: - # TODO check for stride 0 access, only counted once per subgroup + # mem access counted w/subgroup granularity data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( - mtype=["global"] + mtype=["global"], + count_granularity=[lp.CountGranularity.SUBGROUP], + ).to_bytes().sum()*self.subgroup_size + # mem access counted w/workitem granularity + data_moved_bytes += stats_found[kso.MEM_ACCESS_MAP].filter_by( + mtype=["global"], + count_granularity=[lp.CountGranularity.WORKITEM], ).to_bytes().sum() if not self.evaluate_polys: data_moved_bytes = data_moved_bytes.eval_with_dict(param_dict) -- GitLab From b5272b58fb1cdee56aeb7ee2cde77f411fe5bf37 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 2 Feb 2019 19:32:48 -0600 Subject: [PATCH 19/32] MEM_BANDWIDTH now calculated two ways, once counting all global accesses and once counting footprint --- examples/example.py | 6 +++--- kernel_profiler/__init__.py | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/example.py b/examples/example.py index c9db9ebe5..09c25a7e8 100644 --- a/examples/example.py +++ b/examples/example.py @@ -1,7 +1,7 @@ import loopy as lp import numpy as np from kernel_profiler import KernelProfiler -from kernel_profiler import KernelStatOptions as kso +from kernel_profiler import KernelStatOptions as kso # noqa knl = lp.make_kernel( @@ -31,7 +31,7 @@ param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler( #"NVIDIA", "GEFORCE", #"NVIDIA", "K40C", - evaluate_polys = True, + evaluate_polys=True, include_kernel_params_in_ptx_filename=True, ) stats = kp.profile( @@ -55,7 +55,7 @@ print(lp.stringify_stats_mapping(stats[kso.OP_MAP])) print(lp.stringify_stats_mapping(stats[kso.SYNC_MAP])) print(stats[kso.GRID_SIZES], "\n") print(stats[kso.FLOP_RATE], "\n") -print(stats[kso.MEM_BANDWIDTH], "\n") 
+print(stats[kso.MEM_BANDWIDTH][0], stats[kso.MEM_BANDWIDTH][1], "\n") kp.update_options(evaluate_polys=False) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 4cbefbed0..1dd2223fa 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -77,6 +77,8 @@ class KernelProfiler(object): ptx_filename_suffix="", ): + # TODO create cache to store kernels (for executing w/different params) + # TODO create cache to store stats mappings self.ctx_cache = {} self.platform_name = platform_name self.device_name = device_name @@ -402,6 +404,14 @@ class KernelProfiler(object): stats_found[kso.FLOP_RATE] = float_ops/stats_found[kso.WALL_TIME] if kso.MEM_BANDWIDTH in stat_options: + # first get footprint of data moved + from loopy import gather_access_footprint_bytes + footsize_bytes = 0 + for access, count in stats_found[kso.MEM_ACCESS_MAP].items(): + if access.mtype == "global": + direction = "write" if access.direction == "store" else "read" + footsize_bytes += gather_access_footprint_bytes(knl)[(access.variable, direction)].eval_with_dict(param_dict) + # mem access counted w/subgroup granularity data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"], @@ -412,10 +422,14 @@ class KernelProfiler(object): mtype=["global"], count_granularity=[lp.CountGranularity.WORKITEM], ).to_bytes().sum() + # if these polys have not alread been evaluated, evaluate them if not self.evaluate_polys: data_moved_bytes = data_moved_bytes.eval_with_dict(param_dict) - stats_found[kso.MEM_BANDWIDTH] = \ - data_moved_bytes/stats_found[kso.WALL_TIME] + + stats_found[kso.MEM_BANDWIDTH] = ( + data_moved_bytes/stats_found[kso.WALL_TIME], + footsize_bytes/stats_found[kso.WALL_TIME] + ) if kso.SAVE_PTX in stat_options: self.save_ptx(knl, param_dict) -- GitLab From 8198c328bec28743678ebbdae78e675b9ec630d9 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 2 Feb 2019 20:10:28 -0600 Subject: [PATCH 20/32] caching stats maps; combined separate stats getting functions into one to reduce redundant code --- kernel_profiler/__init__.py | 120 +++++++++++++++++------------------- 1 file changed, 57 insertions(+), 63 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 1dd2223fa..7f0d150a3 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -1,5 +1,6 @@ import pyopencl as cl import loopy as lp +from loopy.preprocess import prepare_for_caching def create_rand_args(ctx, knl, param_dict): @@ -78,8 +79,8 @@ class KernelProfiler(object): ): # TODO create cache to store kernels (for executing w/different params) - # TODO create cache to store stats mappings self.ctx_cache = {} + self.stats_mapping_cache = {} self.platform_name = platform_name self.device_name = device_name self.interactive = interactive @@ -237,69 +238,57 @@ class KernelProfiler(object): else: write_ptx(self.get_cl_context(knl), knl) - def get_mem_access_stats( + def get_cached_stats_mapping( self, knl, - param_dict=None, - ): - - from loopy.statistics import get_mem_access_map - - mem_access_map = get_mem_access_map( - knl, - count_redundant_work=self.count_redundant_work, - subgroup_size=self.subgroup_size, - ) - - if self.evaluate_polys: - if param_dict is None: - raise ValueError("Cannont evaluate polynomials without param_dict.") - return mem_access_map.eval(param_dict) - else: - return mem_access_map - - def get_op_stats( - self, - knl, - param_dict=None, + stat_option, # KernelStatOptions ): - from loopy.statistics import get_op_map - - op_map = 
get_op_map( - knl, - count_redundant_work=self.count_redundant_work, - count_within_subscripts=self.count_within_subscripts, - subgroup_size=self.subgroup_size, - count_madds=self.count_madds, - ) + cache_key = (prepare_for_caching(knl), stat_option) + # TODO avoid multiple calls to prepare_for_caching()? - if self.evaluate_polys: - if param_dict is None: - raise ValueError("Cannont evaluate polynomials without param_dict.") - return op_map.eval(param_dict) - else: - return op_map + try: + return self.stats_mapping_cache[cache_key] + except KeyError: + if stat_option == KernelStatOptions.MEM_ACCESS_MAP: + from loopy.statistics import get_mem_access_map + stats_map = get_mem_access_map( + knl, + count_redundant_work=self.count_redundant_work, + subgroup_size=self.subgroup_size, + ) + elif stat_option == KernelStatOptions.OP_MAP: + from loopy.statistics import get_op_map + stats_map = get_op_map( + knl, + count_redundant_work=self.count_redundant_work, + count_within_subscripts=self.count_within_subscripts, + subgroup_size=self.subgroup_size, + count_madds=self.count_madds, + ) + elif stat_option == KernelStatOptions.SYNC_MAP: + from loopy.statistics import get_synchronization_map + stats_map = get_synchronization_map( + knl, + subgroup_size=self.subgroup_size, + ) + self.stats_mapping_cache[cache_key] = stats_map + return stats_map - def get_synchronization_stats( + def get_stats_mapping_and_evaluate_if_required( self, knl, + stat_option, # KernelStatOptions param_dict=None, ): - - from loopy.statistics import get_synchronization_map - - sync_map = get_synchronization_map( - knl, - subgroup_size=self.subgroup_size, - ) - + stats_map = self.get_cached_stats_mapping(knl, stat_option) if self.evaluate_polys: if param_dict is None: - raise ValueError("Cannont evaluate polynomials without param_dict.") - return sync_map.eval(param_dict) + raise ValueError( + "Cannot evaluate polynomials without param_dict.") + return stats_map.eval(param_dict) else: - return sync_map + return stats_map def get_grid_sizes( self, @@ -345,24 +334,29 @@ class KernelProfiler(object): if kso.MEM_ACCESS_MAP in stat_options or \ kso.MEM_BANDWIDTH in stat_options: - stats_found[kso.MEM_ACCESS_MAP] = self.get_mem_access_stats( - knl, - param_dict=param_dict, - ) + stats_found[kso.MEM_ACCESS_MAP] = \ + self.get_stats_mapping_and_evaluate_if_required( + knl, + kso.MEM_ACCESS_MAP, + param_dict=param_dict, + ) if kso.OP_MAP in stat_options or \ kso.FLOP_RATE in stat_options: - stats_found[kso.OP_MAP] = self.get_op_stats( - knl, - param_dict=param_dict, - ) + stats_found[kso.OP_MAP] = \ + self.get_stats_mapping_and_evaluate_if_required( + knl, + kso.OP_MAP, + param_dict=param_dict, + ) if kso.SYNC_MAP in stat_options: stats_found[kso.SYNC_MAP] = \ - self.get_synchronization_stats( - knl, - param_dict=param_dict, - ) + self.get_stats_mapping_and_evaluate_if_required( + knl, + kso.SYNC_MAP, + param_dict=param_dict, + ) if kso.GRID_SIZES in stat_options: stats_found[kso.GRID_SIZES] = self.get_grid_sizes( -- GitLab From f220705a0a6b12ec10f2dba4eaba9a250602fca9 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 2 Feb 2019 20:31:58 -0600 Subject: [PATCH 21/32] caching grid sizes --- kernel_profiler/__init__.py | 49 ++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 7f0d150a3..5d83df7af 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -80,7 +80,7 @@ class KernelProfiler(object): # TODO create 
cache to store kernels (for executing w/different params) self.ctx_cache = {} - self.stats_mapping_cache = {} + self.stat_cache = {} self.platform_name = platform_name self.device_name = device_name self.interactive = interactive @@ -238,7 +238,7 @@ class KernelProfiler(object): else: write_ptx(self.get_cl_context(knl), knl) - def get_cached_stats_mapping( + def get_cached_stats_map( self, knl, stat_option, # KernelStatOptions @@ -248,7 +248,7 @@ class KernelProfiler(object): # TODO avoid multiple calls to prepare_for_caching()? try: - return self.stats_mapping_cache[cache_key] + return self.stat_cache[cache_key] except KeyError: if stat_option == KernelStatOptions.MEM_ACCESS_MAP: from loopy.statistics import get_mem_access_map @@ -272,16 +272,16 @@ class KernelProfiler(object): knl, subgroup_size=self.subgroup_size, ) - self.stats_mapping_cache[cache_key] = stats_map + self.stat_cache[cache_key] = stats_map return stats_map - def get_stats_mapping_and_evaluate_if_required( + def get_stats_map_and_evaluate_if_required( self, knl, stat_option, # KernelStatOptions param_dict=None, ): - stats_map = self.get_cached_stats_mapping(knl, stat_option) + stats_map = self.get_cached_stats_map(knl, stat_option) if self.evaluate_polys: if param_dict is None: raise ValueError( @@ -296,23 +296,32 @@ class KernelProfiler(object): param_dict=None, ): - global_size, local_size = knl.get_grid_size_upper_bounds() + cache_key = (prepare_for_caching(knl), KernelStatOptions.GRID_SIZES) + # TODO avoid multiple calls to prepare_for_caching()? + + try: + grid_sizes = self.stat_cache[cache_key] + except KeyError: + + global_size, local_size = knl.get_grid_size_upper_bounds() - from islpy import PwQPolynomial - gsize_pwqs = [] - lsize_pwqs = [] - for gsize in global_size: - gsize_pwqs.append(PwQPolynomial.from_pw_aff(gsize)) - for lsize in local_size: - lsize_pwqs.append(PwQPolynomial.from_pw_aff(lsize)) + from islpy import PwQPolynomial + gsize_pwqs = [] + lsize_pwqs = [] + for gsize in global_size: + gsize_pwqs.append(PwQPolynomial.from_pw_aff(gsize)) + for lsize in local_size: + lsize_pwqs.append(PwQPolynomial.from_pw_aff(lsize)) + grid_sizes = [gsize_pwqs, lsize_pwqs] + self.stat_cache[cache_key] = grid_sizes if self.evaluate_polys: if param_dict is None: raise ValueError("Cannont evaluate polynomials without param_dict.") - return [g.eval_with_dict(param_dict) for g in gsize_pwqs], \ - [l.eval_with_dict(param_dict) for l in lsize_pwqs] + return [g.eval_with_dict(param_dict) for g in grid_sizes[0]], \ + [l.eval_with_dict(param_dict) for l in grid_sizes[1]] else: - return gsize_pwqs, lsize_pwqs + return grid_sizes def profile( self, @@ -335,7 +344,7 @@ class KernelProfiler(object): if kso.MEM_ACCESS_MAP in stat_options or \ kso.MEM_BANDWIDTH in stat_options: stats_found[kso.MEM_ACCESS_MAP] = \ - self.get_stats_mapping_and_evaluate_if_required( + self.get_stats_map_and_evaluate_if_required( knl, kso.MEM_ACCESS_MAP, param_dict=param_dict, @@ -344,7 +353,7 @@ class KernelProfiler(object): if kso.OP_MAP in stat_options or \ kso.FLOP_RATE in stat_options: stats_found[kso.OP_MAP] = \ - self.get_stats_mapping_and_evaluate_if_required( + self.get_stats_map_and_evaluate_if_required( knl, kso.OP_MAP, param_dict=param_dict, @@ -352,7 +361,7 @@ class KernelProfiler(object): if kso.SYNC_MAP in stat_options: stats_found[kso.SYNC_MAP] = \ - self.get_stats_mapping_and_evaluate_if_required( + self.get_stats_map_and_evaluate_if_required( knl, kso.SYNC_MAP, param_dict=param_dict, -- GitLab From 
7f054b2972b83c38b2b6d37f8b271296b1397b1e Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 2 Feb 2019 20:36:31 -0600 Subject: [PATCH 22/32] in footprint counting, filtering mem map by mtype=global before iterating rather than checking mtype==global --- kernel_profiler/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 5d83df7af..449088eb4 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -410,10 +410,11 @@ class KernelProfiler(object): # first get footprint of data moved from loopy import gather_access_footprint_bytes footsize_bytes = 0 - for access, count in stats_found[kso.MEM_ACCESS_MAP].items(): - if access.mtype == "global": - direction = "write" if access.direction == "store" else "read" - footsize_bytes += gather_access_footprint_bytes(knl)[(access.variable, direction)].eval_with_dict(param_dict) + for access, count in stats_found[kso.MEM_ACCESS_MAP].filter_by( + mtype=["global"]).items(): + direction = "write" if access.direction == "store" else "read" + footsize_bytes += gather_access_footprint_bytes(knl)[ + (access.variable, direction)].eval_with_dict(param_dict) # mem access counted w/subgroup granularity data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( -- GitLab From 861fb7e521813dac9f7f40504f2fe42cd861dd72 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 13 Mar 2019 15:20:08 -0500 Subject: [PATCH 23/32] printing generated code in example --- examples/example.py | 3 +++ kernel_profiler/__init__.py | 1 + 2 files changed, 4 insertions(+) diff --git a/examples/example.py b/examples/example.py index 09c25a7e8..f0b2e970d 100644 --- a/examples/example.py +++ b/examples/example.py @@ -49,6 +49,9 @@ stats = kp.profile( ], param_dict=param_dict, ) + +print(stats[kso.GENERATED_CODE]) + print("\nWall time:", stats[kso.WALL_TIME], "\n") print(lp.stringify_stats_mapping(stats[kso.MEM_ACCESS_MAP])) print(lp.stringify_stats_mapping(stats[kso.OP_MAP])) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/__init__.py index 449088eb4..c6834a985 100644 --- a/kernel_profiler/__init__.py +++ b/kernel_profiler/__init__.py @@ -58,6 +58,7 @@ class KernelStatOptions: MEM_BANDWIDTH = "mem_bandwidth" SAVE_PTX = "save_ptx" GENERATED_CODE = "generated_code" + # TODO mem access to footprint ratio class KernelProfiler(object): -- GitLab From f16dd472b62eb79e9d169924a346002d9e61f873 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 13 Mar 2019 15:37:00 -0500 Subject: [PATCH 24/32] renamed kernel_profiler --- kernel_profiler/{__init__.py => kernel_profiler.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel_profiler/{__init__.py => kernel_profiler.py} (100%) diff --git a/kernel_profiler/__init__.py b/kernel_profiler/kernel_profiler.py similarity index 100% rename from kernel_profiler/__init__.py rename to kernel_profiler/kernel_profiler.py -- GitLab From 899999ebec8f99de49d8b9365b1017a9bde9fb5c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 1 May 2019 12:28:10 -0500 Subject: [PATCH 25/32] moved kernel profiler --- .../kernel_profiler}/kernel_profiler.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) rename {kernel_profiler => loopy/kernel_profiler}/kernel_profiler.py (93%) diff --git a/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py similarity index 93% rename from kernel_profiler/kernel_profiler.py rename to loopy/kernel_profiler/kernel_profiler.py index c6834a985..13de9c0ca 100644 
--- a/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -260,13 +260,26 @@ class KernelProfiler(object): ) elif stat_option == KernelStatOptions.OP_MAP: from loopy.statistics import get_op_map - stats_map = get_op_map( - knl, - count_redundant_work=self.count_redundant_work, - count_within_subscripts=self.count_within_subscripts, - subgroup_size=self.subgroup_size, - count_madds=self.count_madds, - ) + if self.count_madds: + # TODO once madd counting branch is merged, remove this conditional + try: + stats_map = get_op_map( + knl, + count_redundant_work=self.count_redundant_work, + count_within_subscripts=self.count_within_subscripts, + subgroup_size=self.subgroup_size, + count_madds=self.count_madds, + ) + except TypeError: + raise NotImplementedError( + "count_madds requires the unmerged madd counting branch.") + else: + stats_map = get_op_map( + knl, + count_redundant_work=self.count_redundant_work, + count_within_subscripts=self.count_within_subscripts, + subgroup_size=self.subgroup_size, + ) elif stat_option == KernelStatOptions.SYNC_MAP: from loopy.statistics import get_synchronization_map stats_map = get_synchronization_map( @@ -431,6 +444,9 @@ class KernelProfiler(object): if not self.evaluate_polys: data_moved_bytes = data_moved_bytes.eval_with_dict(param_dict) + # TODO decide on better way to handle multiple count granularities here + # (uniform access only counted once per warp) + stats_found[kso.MEM_BANDWIDTH] = ( data_moved_bytes/stats_found[kso.WALL_TIME], footsize_bytes/stats_found[kso.WALL_TIME] -- GitLab From d32f00e0325bff7caf5f808a4ea896b0f223b911 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 1 May 2019 12:28:22 -0500 Subject: [PATCH 26/32] added kernel_profiler example --- examples/python/kernel_profiler.py | 87 ++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 examples/python/kernel_profiler.py diff --git a/examples/python/kernel_profiler.py b/examples/python/kernel_profiler.py new file mode 100644 index 000000000..674c75c7e --- /dev/null +++ b/examples/python/kernel_profiler.py @@ -0,0 +1,87 @@ +import loopy as lp +import numpy as np +from loopy.kernel_profiler.kernel_profiler import KernelProfiler +from loopy.kernel_profiler.kernel_profiler import KernelStatOptions as kso # noqa + + +knl = lp.make_kernel( + "{[i,k,j]: 0<=i Date: Wed, 1 May 2019 12:45:21 -0500 Subject: [PATCH 27/32] when no filename passed to write_ptx, write to stdout --- loopy/kernel_profiler/kernel_profiler.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py index 13de9c0ca..8beaa539a 100644 --- a/loopy/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -42,10 +42,16 @@ def write_ptx(ctx, knl, filename=None): ctx, lp.generate_code_v2(knl).device_code() ).build(options=knl.options.cl_build_options) ptx_src = cl_program.binaries[0] - if not filename: - filename = "ptx_"+knl.name+".ptx" - ptx_src_file = open(filename, 'w') - ptx_src_file.write(ptx_src.decode('utf-8', 'ignore')) + if filename: + ptx_src_file = open(filename, 'w') + ptx_src_file.write(ptx_src.decode('utf-8', 'ignore')) + else: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(knl, "write_ptx_no_filename", + "No filename passed to write_ptx for kernel %s, writing to stdout" + % (knl.name)) + import sys + sys.stdout.write(ptx_src.decode('utf-8', 'ignore')+"\n") class KernelStatOptions: -- GitLab From 
e7b4300996dbf0bc6486d38c32f157dc1f230a8a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 1 May 2019 12:48:06 -0500 Subject: [PATCH 28/32] fixing flake8 issues --- loopy/kernel_profiler/kernel_profiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py index 8beaa539a..71b6eb817 100644 --- a/loopy/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -267,7 +267,7 @@ class KernelProfiler(object): elif stat_option == KernelStatOptions.OP_MAP: from loopy.statistics import get_op_map if self.count_madds: - # TODO once madd counting branch is merged, remove this conditional + # TODO once madd counting branch is merged, remove conditional try: stats_map = get_op_map( knl, @@ -278,7 +278,7 @@ class KernelProfiler(object): ) except TypeError: raise NotImplementedError( - "count_madds requires the unmerged madd counting branch.") + "count_madds requires unmerged madd counting branch.") else: stats_map = get_op_map( knl, -- GitLab From ea61f1b0c784842b59b5de1a0cba8fd6870cb155 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 31 May 2019 03:18:19 -0500 Subject: [PATCH 29/32] changed update_options() to copy(); new copy of everything except cache --- examples/python/kernel_profiler.py | 10 +-- loopy/kernel_profiler/kernel_profiler.py | 99 ++++++++++++++---------- 2 files changed, 64 insertions(+), 45 deletions(-) diff --git a/examples/python/kernel_profiler.py b/examples/python/kernel_profiler.py index 674c75c7e..71bcad903 100644 --- a/examples/python/kernel_profiler.py +++ b/examples/python/kernel_profiler.py @@ -23,9 +23,9 @@ knl = lp.split_iname(knl, "k", lsize) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto") knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto") -n = 2**10 -m = 2**11 -ell = 2**9 +n = 2**8 +m = 2**9 +ell = 2**7 param_dict = {'n': n, 'm': m, 'ell': ell} kp = KernelProfiler( @@ -70,9 +70,9 @@ print(stats[kso.FLOP_RATE]*inv_giga) print("\nMem throughput rate (GB/s) (total data accessed, data footprint only):") print(stats[kso.MEM_BANDWIDTH][0]*inv_giga, stats[kso.MEM_BANDWIDTH][1]*inv_giga, "\n") -kp.update_options(evaluate_polys=True) +kp2 = kp.copy(evaluate_polys=True) -stats = kp.profile( +stats = kp2.profile( knl, stats, param_dict=param_dict) print("Now change eval_polys to True =========================================") diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py index 71b6eb817..0e4069560 100644 --- a/loopy/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -105,46 +105,65 @@ class KernelProfiler(object): include_kernel_params_in_ptx_filename self.ptx_filename_suffix = ptx_filename_suffix - def update_options( - self, - platform_name=None, - device_name=None, - interactive=None, - n_warmup_wtime_trials=None, - n_wtime_trials=None, - evaluate_polys=None, - count_redundant_work=None, - subgroup_size=None, - count_madds=None, - count_within_subscripts=None, - include_kernel_params_in_ptx_filename=None, - ptx_filename_suffix=None, - ): - if platform_name is not None: - self.platform_name = platform_name - if device_name is not None: - self.device_name = device_name - if interactive is not None: - self.interactive = interactive - if n_warmup_wtime_trials is not None: - self.n_warmup_wtime_trials = n_warmup_wtime_trials - if n_wtime_trials is not None: - self.n_wtime_trials = n_wtime_trials - if 
evaluate_polys is not None: - self.evaluate_polys = evaluate_polys - if count_redundant_work is not None: - self.count_redundant_work = count_redundant_work - if subgroup_size is not None: - self.subgroup_size = subgroup_size - if count_madds is not None: - self.count_madds = count_madds - if count_within_subscripts is not None: - self.count_within_subscripts = count_within_subscripts - if include_kernel_params_in_ptx_filename is not None: - self.include_kernel_params_in_ptx_filename = \ - include_kernel_params_in_ptx_filename - if ptx_filename_suffix is not None: - self.ptx_filename_suffix = ptx_filename_suffix + def copy( + self, + platform_name=None, + device_name=None, + interactive=None, + n_warmup_time_trials=None, + n_time_trials=None, + evaluate_polys=None, + subgroup_size=None, + count_redundant_work=None, + count_madds=None, + count_within_subscripts=None, + include_kernel_params_in_ptx_filename=None, + ptx_filename_suffix=None, + ): + + platform_name_new = self.platform_name \ + if platform_name is None else platform_name + device_name_new = self.device_name if device_name is None else device_name + interactive_new = self.interactive if interactive is None else interactive + n_warmup_time_trials_new = self.n_warmup_time_trials \ + if n_warmup_time_trials is None else n_warmup_time_trials + n_time_trials_new = self.n_time_trials \ + if n_time_trials is None else n_time_trials + evaluate_polys_new = self.evaluate_polys \ + if evaluate_polys is None else evaluate_polys + count_redundant_work_new = self.count_redundant_work \ + if count_redundant_work is None else count_redundant_work + subgroup_size_new = self.subgroup_size \ + if subgroup_size is None else subgroup_size + count_madds_new = self.count_madds if count_madds is None else count_madds + count_within_subscripts_new = self.count_within_subscripts \ + if count_within_subscripts is None else count_within_subscripts + include_kernel_params_in_ptx_filename_new = \ + self.include_kernel_params_in_ptx_filename \ + if include_kernel_params_in_ptx_filename is None \ + else include_kernel_params_in_ptx_filename + ptx_filename_suffix_new = self.ptx_filename_suffix \ + if ptx_filename_suffix is None else ptx_filename_suffix + + profiler_new = KernelProfiler( + platform_name=platform_name_new, + device_name=device_name_new, + interactive=interactive_new, + n_warmup_time_trials=n_warmup_time_trials_new, + n_time_trials=n_time_trials_new, + evaluate_polys=evaluate_polys_new, + subgroup_size=subgroup_size_new, + count_redundant_work=count_redundant_work_new, + count_madds=count_madds_new, + count_within_subscripts=count_within_subscripts_new, + include_kernel_params_in_ptx_filename= # noqa + include_kernel_params_in_ptx_filename_new, + ptx_filename_suffix=ptx_filename_suffix_new, + ) + profiler_new.ctx_cache = self.ctx_cache + profiler_new.stat_cache = self.stat_cache + + return profiler_new def get_cl_context(self, knl): -- GitLab From 8f270c16648af46f30fc56e5c7f0ab8859adedcf Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 6 Jun 2019 23:42:31 -0500 Subject: [PATCH 30/32] renaming profiler->stat_collector (still need to change file/dir names) --- examples/python/kernel_profiler.py | 8 ++++---- loopy/kernel_profiler/kernel_profiler.py | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/python/kernel_profiler.py b/examples/python/kernel_profiler.py index 71bcad903..f5aa90bb0 100644 --- a/examples/python/kernel_profiler.py +++ b/examples/python/kernel_profiler.py @@ -1,6 +1,6 @@ import loopy 
as lp import numpy as np -from loopy.kernel_profiler.kernel_profiler import KernelProfiler +from loopy.kernel_profiler.kernel_profiler import KernelStatCollector from loopy.kernel_profiler.kernel_profiler import KernelStatOptions as kso # noqa @@ -28,14 +28,14 @@ m = 2**9 ell = 2**7 param_dict = {'n': n, 'm': m, 'ell': ell} -kp = KernelProfiler( +kp = KernelStatCollector( #"NVIDIA", "GEFORCE", #"NVIDIA", "K40C", evaluate_polys=False, count_madds=False, # TODO enables this after madd counting branch is merged include_kernel_params_in_ptx_filename=True, ) -stats = kp.profile( +stats = kp.collect_stats( knl, [ kso.WALL_TIME, @@ -72,7 +72,7 @@ print(stats[kso.MEM_BANDWIDTH][0]*inv_giga, stats[kso.MEM_BANDWIDTH][1]*inv_giga kp2 = kp.copy(evaluate_polys=True) -stats = kp2.profile( +stats = kp2.collect_stats( knl, stats, param_dict=param_dict) print("Now change eval_polys to True =========================================") diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_profiler/kernel_profiler.py index 0e4069560..c22726b5c 100644 --- a/loopy/kernel_profiler/kernel_profiler.py +++ b/loopy/kernel_profiler/kernel_profiler.py @@ -67,7 +67,7 @@ class KernelStatOptions: # TODO mem access to footprint ratio -class KernelProfiler(object): +class KernelStatCollector(object): def __init__( self, @@ -145,7 +145,7 @@ class KernelProfiler(object): ptx_filename_suffix_new = self.ptx_filename_suffix \ if ptx_filename_suffix is None else ptx_filename_suffix - profiler_new = KernelProfiler( + stat_collector_new = KernelStatCollector( platform_name=platform_name_new, device_name=device_name_new, interactive=interactive_new, @@ -160,10 +160,10 @@ class KernelProfiler(object): include_kernel_params_in_ptx_filename_new, ptx_filename_suffix=ptx_filename_suffix_new, ) - profiler_new.ctx_cache = self.ctx_cache - profiler_new.stat_cache = self.stat_cache + stat_collector_new.ctx_cache = self.ctx_cache + stat_collector_new.stat_cache = self.stat_cache - return profiler_new + return stat_collector_new def get_cl_context(self, knl): @@ -178,7 +178,7 @@ class KernelProfiler(object): self.device_name is not None and self.device_name not in knl_device_name): raise ValueError("kernel target platform %s and/or device %s do " - "not match profiler platform %s and/or device %s." + "not match KernelStatCollector platform %s and/or device %s." 
% (knl_platform_name, knl_device_name, self.platform_name, self.device_name)) @@ -194,7 +194,7 @@ class KernelProfiler(object): elif self.platform_name is None or self.device_name is None: # kernel does not have a pre-specified device, - # and profiler does not know platform+device + # and KernelStatCollector does not know platform+device ctx = cl.create_some_context() # interactive mode self.platform_name = ctx.devices[0].platform.name self.device_name = ctx.devices[0].name @@ -202,7 +202,7 @@ class KernelProfiler(object): return ctx else: - # profiler knows both platform and device already + # KernelStatCollector knows both platform and device already cache_key = (self.platform_name, self.device_name, "ctx") try: return self.ctx_cache[cache_key] @@ -362,7 +362,7 @@ class KernelProfiler(object): else: return grid_sizes - def profile( + def collect_stats( self, knl, stat_options=[], -- GitLab From f42a619426ab4eeeb649bd9e20ed1c846cb5a249 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 6 Jun 2019 23:44:33 -0500 Subject: [PATCH 31/32] renaming directories/files for kernel profiler -> stat collector --- .../{kernel_profiler.py => kernel_stat_collector.py} | 7 ++++--- .../kernel_stat_collector.py} | 0 2 files changed, 4 insertions(+), 3 deletions(-) rename examples/python/{kernel_profiler.py => kernel_stat_collector.py} (89%) rename loopy/{kernel_profiler/kernel_profiler.py => kernel_stat_collector/kernel_stat_collector.py} (100%) diff --git a/examples/python/kernel_profiler.py b/examples/python/kernel_stat_collector.py similarity index 89% rename from examples/python/kernel_profiler.py rename to examples/python/kernel_stat_collector.py index f5aa90bb0..5924b6b30 100644 --- a/examples/python/kernel_profiler.py +++ b/examples/python/kernel_stat_collector.py @@ -1,7 +1,7 @@ import loopy as lp import numpy as np -from loopy.kernel_profiler.kernel_profiler import KernelStatCollector -from loopy.kernel_profiler.kernel_profiler import KernelStatOptions as kso # noqa +from loopy.kernel_stat_collector.kernel_stat_collector import KernelStatCollector +from loopy.kernel_stat_collector.kernel_stat_collector import KernelStatOptions as kso # noqa knl = lp.make_kernel( @@ -68,7 +68,8 @@ print(stats[kso.GRID_SIZES]) print("\nFlop rate (GFLOP/s):") print(stats[kso.FLOP_RATE]*inv_giga) print("\nMem throughput rate (GB/s) (total data accessed, data footprint only):") -print(stats[kso.MEM_BANDWIDTH][0]*inv_giga, stats[kso.MEM_BANDWIDTH][1]*inv_giga, "\n") +print(stats[kso.MEM_BANDWIDTH][0]*inv_giga, + stats[kso.MEM_BANDWIDTH][1]*inv_giga, "\n") kp2 = kp.copy(evaluate_polys=True) diff --git a/loopy/kernel_profiler/kernel_profiler.py b/loopy/kernel_stat_collector/kernel_stat_collector.py similarity index 100% rename from loopy/kernel_profiler/kernel_profiler.py rename to loopy/kernel_stat_collector/kernel_stat_collector.py -- GitLab From c5da129c3617acf0633c82daeca71e7d8c3b3bce Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 7 Jun 2019 00:07:54 -0500 Subject: [PATCH 32/32] processing subgroup size before using it to compute flops/throughput --- loopy/kernel_stat_collector/kernel_stat_collector.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/kernel_stat_collector/kernel_stat_collector.py b/loopy/kernel_stat_collector/kernel_stat_collector.py index c22726b5c..026f9e74f 100644 --- a/loopy/kernel_stat_collector/kernel_stat_collector.py +++ b/loopy/kernel_stat_collector/kernel_stat_collector.py @@ -414,11 +414,13 @@ class KernelStatCollector(object): if kso.FLOP_RATE in 
stat_options: import numpy as np + from loopy.statistics import _process_subgroup_size + sgs_processed = _process_subgroup_size(knl, self.subgroup_size) # count madds as 2 ops # (count all flops once and then count the madds again) # flops counted w/subgroup granularity - float_ops = self.subgroup_size*( + float_ops = sgs_processed*( stats_found[kso.OP_MAP].filter_by( dtype=[np.float32, np.float64], count_granularity=[lp.CountGranularity.SUBGROUP], @@ -448,6 +450,8 @@ class KernelStatCollector(object): if kso.MEM_BANDWIDTH in stat_options: # first get footprint of data moved from loopy import gather_access_footprint_bytes + from loopy.statistics import _process_subgroup_size + sgs_processed = _process_subgroup_size(knl, self.subgroup_size) footsize_bytes = 0 for access, count in stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"]).items(): @@ -459,7 +463,7 @@ class KernelStatCollector(object): data_moved_bytes = stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"], count_granularity=[lp.CountGranularity.SUBGROUP], - ).to_bytes().sum()*self.subgroup_size + ).to_bytes().sum()*sgs_processed # mem access counted w/workitem granularity data_moved_bytes += stats_found[kso.MEM_ACCESS_MAP].filter_by( mtype=["global"], -- GitLab
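
As a rough sketch of the derived-metric arithmetic introduced in the last two patches (FLOP_RATE and MEM_BANDWIDTH): the names op_map, mem_map, wall_time and subgroup_size below are illustrative assumptions, standing in for already-evaluated loopy count maps (e.g. collected with evaluate_polys=True), a measured wall time in seconds, and the processed subgroup size; the madd re-count and the workitem-granularity/footprint terms from the actual diffs are left out for brevity.

import numpy as np
import loopy as lp

def derived_metrics(op_map, mem_map, wall_time, subgroup_size):
    # Flop rate: float ops counted at subgroup granularity, scaled by the
    # subgroup size and divided by the measured wall time. (The patch also
    # re-counts madds so each contributes two ops; omitted in this sketch.)
    float_ops = subgroup_size * op_map.filter_by(
        dtype=[np.float32, np.float64],
        count_granularity=[lp.CountGranularity.SUBGROUP],
    ).sum()
    flop_rate = float_ops / wall_time

    # Memory throughput: bytes of global-memory traffic counted at subgroup
    # granularity. (The patch additionally adds workitem-granularity accesses
    # and reports a second, footprint-based figure; omitted in this sketch.)
    data_moved_bytes = subgroup_size * mem_map.filter_by(
        mtype=["global"],
        count_granularity=[lp.CountGranularity.SUBGROUP],
    ).to_bytes().sum()
    mem_bandwidth = data_moved_bytes / wall_time

    return flop_rate, mem_bandwidth

With the example kernel's param_dict and a measured wall time, flop_rate and mem_bandwidth roughly correspond to the FLOP_RATE value and the first MEM_BANDWIDTH entry printed by examples/python/kernel_stat_collector.py.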