From a50e257a0ddbefa89f14609e088c88b92c542d53 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 3 Apr 2019 19:31:58 -0500
Subject: [PATCH 01/35] Add an expression node for calls to external functions

This change adds the ExternalCall node and execution support for it.
External functions may be passed as part of the execution context.
---
 grudge/execution.py                  | 14 ++++++-
 grudge/symbolic/compiler.py          | 40 +++++++++++++-----
 grudge/symbolic/dofdesc_inference.py |  3 ++
 grudge/symbolic/mappers/__init__.py  | 31 ++++++++++++++
 grudge/symbolic/primitives.py        | 63 ++++++++++++++++++++--------
 test/test_grudge.py                  | 29 +++++++++++++
 6 files changed, 150 insertions(+), 30 deletions(-)

diff --git a/grudge/execution.py b/grudge/execution.py
index c20aa4bc..9084b823 100644
--- a/grudge/execution.py
+++ b/grudge/execution.py
@@ -94,13 +94,23 @@ class ExecutionMapper(mappers.Evaluator,
                 value = ary
         return value
 
-    def map_call(self, expr):
+    def map_external_call(self, expr):
         from pymbolic.primitives import Variable
         assert isinstance(expr.function, Variable)
+        args = [self.rec(p) for p in expr.parameters]
 
-        # FIXME: Make a way to register functions
+        return self.context[expr.function.name](*args)
+
+    def map_call(self, expr):
+        from pymbolic.primitives import Variable
+        assert isinstance(expr.function, Variable)
 
         args = [self.rec(p) for p in expr.parameters]
+
+        # Function lookup precedence:
+        # * Numpy functions
+        # * OpenCL functions
+
         from numbers import Number
         representative_arg = args[0]
         if (
diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index b27d5585..2b25f0f0 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -667,13 +667,14 @@ def aggregate_assignments(inf_mapper, instructions, result,
                 for assignee in insn.get_assignees())
 
     from pytools import partition
-    from grudge.symbolic.primitives import DTAG_SCALAR
+    from grudge.symbolic.primitives import DTAG_SCALAR, ExternalCall
 
     unprocessed_assigns, other_insns = partition(
             lambda insn: (
                 isinstance(insn, Assign)
                 and not isinstance(insn, ToDiscretizationScopedAssign)
                 and not isinstance(insn, FromDiscretizationScopedAssign)
+                and not isinstance(insn.exprs[0], ExternalCall)
                 and not any(
                     inf_mapper.infer_for_name(n).domain_tag == DTAG_SCALAR
                     for n in insn.names)),
@@ -886,6 +887,10 @@ class ToLoopyExpressionMapper(mappers.IdentityMapper):
                 expr,
                 "%s_%d" % (expr.aggregate.name, subscript))
 
+    def map_external_call(self, expr):
+        raise ValueError(
+                "Cannot map external call '%s' into loopy" % expr.function)
+
     def map_call(self, expr):
         if isinstance(expr.function, sym.CFunction):
             from pymbolic import var
@@ -970,8 +975,11 @@ class ToLoopyInstructionMapper(object):
         self.dd_inference_mapper = dd_inference_mapper
 
     def map_insn_assign(self, insn):
-        from grudge.symbolic.primitives import OperatorBinding
-        if len(insn.exprs) == 1 and isinstance(insn.exprs[0], OperatorBinding):
+        from grudge.symbolic.primitives import OperatorBinding, ExternalCall
+
+        if (
+                len(insn.exprs) == 1
+                and isinstance(insn.exprs[0], (OperatorBinding, ExternalCall))):
             return insn
 
         iname = "grdg_i"
@@ -1261,6 +1269,17 @@ class OperatorCompiler(mappers.IdentityMapper):
                     prefix=name_hint)
             return result_var
 
+    def map_external_call(self, expr, codegen_state):
+        return self.assign_to_new_var(
+                    codegen_state,
+                    type(expr)(
+                        expr.function,
+                        [self.assign_to_new_var(
+                            codegen_state,
+                            self.rec(par, codegen_state))
+                            for par in expr.parameters],
+                        expr.dd))
+
     def map_call(self, expr, codegen_state):
         from grudge.symbolic.primitives import CFunction
         if isinstance(expr.function, CFunction):
@@ -1268,15 +1287,14 @@ class OperatorCompiler(mappers.IdentityMapper):
         else:
             # If it's not a C-level function, it shouldn't get muddled up into
             # a vector math expression.
-
             return self.assign_to_new_var(
-                    codegen_state,
-                    type(expr)(
-                        expr.function,
-                        [self.assign_to_new_var(
-                            codegen_state,
-                            self.rec(par, codegen_state))
-                            for par in expr.parameters]))
+                        codegen_state,
+                        type(expr)(
+                            expr.function,
+                            [self.assign_to_new_var(
+                                codegen_state,
+                                self.rec(par, codegen_state))
+                                for par in expr.parameters]))
 
     def map_ref_diff_op_binding(self, expr, codegen_state):
         try:
diff --git a/grudge/symbolic/dofdesc_inference.py b/grudge/symbolic/dofdesc_inference.py
index 92be126f..96ead088 100644
--- a/grudge/symbolic/dofdesc_inference.py
+++ b/grudge/symbolic/dofdesc_inference.py
@@ -191,6 +191,9 @@ class DOFDescInferenceMapper(RecursiveMapper, CSECachingMapperMixin):
         # FIXME
         return arg_dds[0]
 
+    def map_external_call(self, expr):
+        return expr.dd
+
     # }}}
 
     # {{{ instruction mappings
diff --git a/grudge/symbolic/mappers/__init__.py b/grudge/symbolic/mappers/__init__.py
index d52f7ac5..d1b70475 100644
--- a/grudge/symbolic/mappers/__init__.py
+++ b/grudge/symbolic/mappers/__init__.py
@@ -215,6 +215,12 @@ class IdentityMapperMixin(LocalOpReducerMixin, FluxOpReducerMixin):
     map_ones = map_grudge_variable
     map_node_coordinate_component = map_grudge_variable
 
+    def map_external_call(self, expr, *args, **kwargs):
+        return type(expr)(
+                self.rec(expr.function, *args, **kwargs),
+                self.rec(expr.parameters, *args, **kwargs),
+                dd=expr.dd)
+
     # }}}
 
 
@@ -268,6 +274,15 @@ class DependencyMapper(
     map_ones = _map_leaf
     map_node_coordinate_component = _map_leaf
 
+    def map_external_call(self, expr):
+        result = self.map_call(expr)
+        if self.include_calls == "descend_args":
+            # Unlike regular calls, we regard the function as an argument,
+            # because it's user-supplied (and thus we need to pick it up as a
+            # dependency).
+            result = self.combine((result, self.rec(expr.function)))
+        return result
+
 
 class FlopCounter(
         CombineMapperMixin,
@@ -281,6 +296,9 @@ class FlopCounter(
     def map_c_function(self, expr):
         return 1
 
+    def map_external_call(self, expr):
+        return 1
+
     def map_ones(self, expr):
         return 0
 
@@ -849,6 +867,15 @@ class StringifyMapper(pymbolic.mapper.stringifier.StringifyMapper):
     def map_interpolation(self, expr, enclosing_prec):
         return "Interp" + self._format_op_dd(expr)
 
+    def map_external_call(self, expr, enclosing_prec):
+        from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE
+        return (
+                self.parenthesize_if_needed(
+                    "External:%s:%s" % (
+                        self.map_call(expr, PREC_NONE), self._format_dd(expr.dd)),
+                    enclosing_prec,
+                    PREC_CALL))
+
 
 class PrettyStringifyMapper(
         pymbolic.mapper.stringifier.CSESplittingStringifyMapperMixin,
@@ -1224,6 +1251,7 @@ class CollectorMixin(OperatorReducerMixin, LocalOpReducerMixin, FluxOpReducerMix
     def map_constant(self, expr, *args, **kwargs):
         return set()
 
+    map_variable = map_constant
     map_grudge_variable = map_constant
     map_c_function = map_grudge_variable
 
@@ -1242,6 +1270,9 @@ class BoundOperatorCollector(CSECachingMapperMixin, CollectorMixin, CombineMappe
     def __init__(self, op_class):
         self.op_class = op_class
 
+    def map_external_call(self, expr):
+        return self.map_call(expr)
+
     map_common_subexpression_uncached = \
             CombineMapper.map_common_subexpression
 
diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index 231a70df..65676118 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -34,12 +34,13 @@ from meshmode.discretization.connection import (  # noqa
 
 from pymbolic.primitives import (  # noqa
         cse_scope as cse_scope_base,
-        make_common_subexpression as cse, If, Comparison)
+        make_common_subexpression as cse, If, Comparison, Expression)
 from pymbolic.geometric_algebra import MultiVector
 from pytools.obj_array import join_fields, make_obj_array  # noqa
 
 
-class ExpressionBase(pymbolic.primitives.Expression):
+class GrudgeStringifiable(object):
+
     def stringifier(self):
         from grudge.symbolic.mappers import StringifyMapper
         return StringifyMapper
@@ -74,6 +75,7 @@ Symbols
 
 .. autoclass:: Variable
 .. autoclass:: ScalarVariable
+.. autoclass:: ExternalCall
 .. autoclass:: make_sym_array
 .. autoclass:: make_sym_mv
 .. autoclass:: CFunction
@@ -289,7 +291,16 @@ class HasDOFDesc(object):
         discretization on which this property is given.
     """
 
-    def __init__(self, dd):
+    def __init__(self, *args, **kwargs):
+        # The remaining arguments are passed to the chained superclass.
+
+        if "dd" in kwargs:
+            dd = kwargs.pop("dd")
+        else:
+            dd = args[-1]
+            args = args[:-1]
+
+        super(HasDOFDesc, self).__init__(*args, **kwargs)
         self.dd = dd
 
     def __getinitargs__(self):
@@ -298,9 +309,7 @@ class HasDOFDesc(object):
     def with_dd(self, dd):
         """Return a copy of *self*, modified to the given DOF descriptor.
         """
-        return type(self)(
-                *self.__getinitargs__()[:-1],
-                dd=dd or self.dd)
+        return type(self)(*self.__getinitargs__())
 
 # }}}
 
@@ -311,7 +320,7 @@ class cse_scope(cse_scope_base):  # noqa
     DISCRETIZATION = "grudge_discretization"
 
 
-class Variable(HasDOFDesc, ExpressionBase, pymbolic.primitives.Variable):
+class Variable(HasDOFDesc, GrudgeStringifiable, pymbolic.primitives.Variable):
     """A user-supplied input variable with a known :class:`DOFDesc`.
     """
     init_arg_names = ("name", "dd")
@@ -320,8 +329,7 @@ class Variable(HasDOFDesc, ExpressionBase, pymbolic.primitives.Variable):
         if dd is None:
             dd = DD_VOLUME
 
-        HasDOFDesc.__init__(self, dd)
-        pymbolic.primitives.Variable.__init__(self, name)
+        super(Variable, self).__init__(name, dd)
 
     def __getinitargs__(self):
         return (self.name, self.dd,)
@@ -337,6 +345,30 @@ class ScalarVariable(Variable):
         super(ScalarVariable, self).__init__(name, DD_SCALAR)
 
 
+class ExternalCall(HasDOFDesc, GrudgeStringifiable, pymbolic.primitives.Call):
+
+    init_arg_names = ("function", "parameters", "dd")
+
+    def __getinitargs__(self):
+        return (self.function, self.parameters, self.dd)
+
+    def get_hash(self):
+        # Object arrays are permitted as parameters, but are not hashable.
+        params = []
+        for param in self.parameters:
+            if isinstance(param, np.ndarray):
+                assert param.dtype == np.object
+                param = tuple(param)
+            params.append(param)
+
+        return hash(
+                (type(self).__name__, self.function)
+                + tuple(params)
+                + (self.dd,))
+
+    mapper_method = "map_external_call"
+
+
 def make_sym_array(name, shape, dd=None):
     def var_factory(name):
         return Variable(name, dd)
@@ -349,13 +381,10 @@ def make_sym_mv(name, dim, var_factory=None):
             make_sym_array(name, dim, var_factory))
 
 
-class CFunction(pymbolic.primitives.Variable):
+class CFunction(GrudgeStringifiable, pymbolic.primitives.Variable):
     """A symbol representing a C-level function, to be used as the function
     argument of :class:`pymbolic.primitives.Call`.
     """
-    def stringifier(self):
-        from grudge.symbolic.mappers import StringifyMapper
-        return StringifyMapper
 
     def __call__(self, *exprs):
         from pytools.obj_array import with_object_array_or_scalar_n_args
@@ -379,7 +408,7 @@ bessel_y = CFunction("bessel_y")
 
 # {{{ technical helpers
 
-class OperatorBinding(ExpressionBase):
+class OperatorBinding(GrudgeStringifiable, pymbolic.primitives.Expression):
     init_arg_names = ("op", "field")
 
     def __init__(self, op, field):
@@ -424,16 +453,16 @@ class PrioritizedSubexpression(pymbolic.primitives.CommonSubexpression):
 # }}}
 
 
-class Ones(ExpressionBase, HasDOFDesc):
+class Ones(GrudgeStringifiable, HasDOFDesc, Expression):
     def __getinitargs__(self):
-        return ()
+        return (self.dd,)
 
     mapper_method = intern("map_ones")
 
 
 # {{{ geometry data
 
-class DiscretizationProperty(ExpressionBase, HasDOFDesc):
+class DiscretizationProperty(GrudgeStringifiable, HasDOFDesc, Expression):
     pass
 
 
diff --git a/test/test_grudge.py b/test/test_grudge.py
index cf820f4c..3dfa3984 100644
--- a/test/test_grudge.py
+++ b/test/test_grudge.py
@@ -514,6 +514,35 @@ def test_bessel(ctx_factory):
     assert z < 1e-15
 
 
+def test_ExternalCall(ctx_factory):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    def double(x):
+        return 2 * x
+
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+
+    dims = 2
+
+    mesh = generate_regular_rect_mesh(a=(0,) * dims, b=(1,) * dims, n=(4,) * dims)
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=1)
+
+    ones = sym.Ones(sym.DD_VOLUME)
+    from pymbolic.primitives import Variable
+    op = (
+            ones * 3
+            + sym.ExternalCall(
+                Variable("double"),
+                (ones,),
+                sym.DD_VOLUME))
+
+    bound_op = bind(discr, op)
+
+    result = bound_op(queue, double=double)
+    assert (result == 5).get().all()
+
+
 # You can test individual routines by typing
 # $ python test_grudge.py 'test_routine()'
 
-- 
GitLab


From 56543a47350001241767c0fc9b19adea45d8187b Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 3 Apr 2019 19:41:17 -0500
Subject: [PATCH 02/35] Fix indentation

---
 grudge/symbolic/compiler.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index 2b25f0f0..ce896235 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -1271,14 +1271,14 @@ class OperatorCompiler(mappers.IdentityMapper):
 
     def map_external_call(self, expr, codegen_state):
         return self.assign_to_new_var(
-                    codegen_state,
-                    type(expr)(
-                        expr.function,
-                        [self.assign_to_new_var(
-                            codegen_state,
-                            self.rec(par, codegen_state))
-                            for par in expr.parameters],
-                        expr.dd))
+                codegen_state,
+                type(expr)(
+                    expr.function,
+                    [self.assign_to_new_var(
+                        codegen_state,
+                        self.rec(par, codegen_state))
+                        for par in expr.parameters],
+                    expr.dd))
 
     def map_call(self, expr, codegen_state):
         from grudge.symbolic.primitives import CFunction
@@ -1288,13 +1288,13 @@ class OperatorCompiler(mappers.IdentityMapper):
             # If it's not a C-level function, it shouldn't get muddled up into
             # a vector math expression.
             return self.assign_to_new_var(
-                        codegen_state,
-                        type(expr)(
-                            expr.function,
-                            [self.assign_to_new_var(
-                                codegen_state,
-                                self.rec(par, codegen_state))
-                                for par in expr.parameters]))
+                    codegen_state,
+                    type(expr)(
+                        expr.function,
+                        [self.assign_to_new_var(
+                            codegen_state,
+                            self.rec(par, codegen_state))
+                            for par in expr.parameters]))
 
     def map_ref_diff_op_binding(self, expr, codegen_state):
         try:
-- 
GitLab


From fecf4224691b5aadbe0e2ae163e301332effdb46 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 3 Apr 2019 19:43:37 -0500
Subject: [PATCH 03/35] flake8 fix

---
 test/test_grudge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_grudge.py b/test/test_grudge.py
index 3dfa3984..115694f0 100644
--- a/test/test_grudge.py
+++ b/test/test_grudge.py
@@ -514,7 +514,7 @@ def test_bessel(ctx_factory):
     assert z < 1e-15
 
 
-def test_ExternalCall(ctx_factory):
+def test_ExternalCall(ctx_factory):  # noqa
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
-- 
GitLab


From 2f992e19d238739b9edc90fc37d2e86ac074d0cc Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 3 Apr 2019 19:45:01 -0500
Subject: [PATCH 04/35] Remove object array stuff (doesn't work)

---
 grudge/symbolic/primitives.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index 65676118..e9af1ee9 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -352,20 +352,6 @@ class ExternalCall(HasDOFDesc, GrudgeStringifiable, pymbolic.primitives.Call):
     def __getinitargs__(self):
         return (self.function, self.parameters, self.dd)
 
-    def get_hash(self):
-        # Object arrays are permitted as parameters, but are not hashable.
-        params = []
-        for param in self.parameters:
-            if isinstance(param, np.ndarray):
-                assert param.dtype == np.object
-                param = tuple(param)
-            params.append(param)
-
-        return hash(
-                (type(self).__name__, self.function)
-                + tuple(params)
-                + (self.dd,))
-
     mapper_method = "map_external_call"
 
 
-- 
GitLab


From 8fee177f22e5d83b7ca097604474cdeed8524783 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 3 Apr 2019 19:47:14 -0500
Subject: [PATCH 05/35] Document

---
 grudge/symbolic/primitives.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index e9af1ee9..b8fe6c5a 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -346,6 +346,8 @@ class ScalarVariable(Variable):
 
 
 class ExternalCall(HasDOFDesc, GrudgeStringifiable, pymbolic.primitives.Call):
+    """A call to a user-supplied function with a :class:`DOFDesc`.
+    """
 
     init_arg_names = ("function", "parameters", "dd")
 
-- 
GitLab


From 9ad13fd263e9f9759be4f6b7126e7543ef1bf002 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 4 Apr 2019 02:49:17 +0200
Subject: [PATCH 06/35] pymbolic.primitives.Expression -> Expression

---
 grudge/symbolic/primitives.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index b8fe6c5a..d7a3f585 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -396,7 +396,7 @@ bessel_y = CFunction("bessel_y")
 
 # {{{ technical helpers
 
-class OperatorBinding(GrudgeStringifiable, pymbolic.primitives.Expression):
+class OperatorBinding(GrudgeStringifiable, Expression):
     init_arg_names = ("op", "field")
 
     def __init__(self, op, field):
-- 
GitLab


From d1dee68bd739f7ab824e7bde4b2d1793d8d29aeb Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 3 Apr 2019 19:56:38 -0500
Subject: [PATCH 07/35] Redo the inheritance hierarchy to use ExpressionBase
 again

---
 grudge/symbolic/primitives.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index b8fe6c5a..571547fd 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -34,12 +34,12 @@ from meshmode.discretization.connection import (  # noqa
 
 from pymbolic.primitives import (  # noqa
         cse_scope as cse_scope_base,
-        make_common_subexpression as cse, If, Comparison, Expression)
+        make_common_subexpression as cse, If, Comparison)
 from pymbolic.geometric_algebra import MultiVector
 from pytools.obj_array import join_fields, make_obj_array  # noqa
 
 
-class GrudgeStringifiable(object):
+class ExpressionBase(pymbolic.primitives.Expression):
 
     def stringifier(self):
         from grudge.symbolic.mappers import StringifyMapper
@@ -320,7 +320,7 @@ class cse_scope(cse_scope_base):  # noqa
     DISCRETIZATION = "grudge_discretization"
 
 
-class Variable(HasDOFDesc, GrudgeStringifiable, pymbolic.primitives.Variable):
+class Variable(HasDOFDesc, ExpressionBase, pymbolic.primitives.Variable):
     """A user-supplied input variable with a known :class:`DOFDesc`.
     """
     init_arg_names = ("name", "dd")
@@ -345,7 +345,7 @@ class ScalarVariable(Variable):
         super(ScalarVariable, self).__init__(name, DD_SCALAR)
 
 
-class ExternalCall(HasDOFDesc, GrudgeStringifiable, pymbolic.primitives.Call):
+class ExternalCall(HasDOFDesc, ExpressionBase, pymbolic.primitives.Call):
     """A call to a user-supplied function with a :class:`DOFDesc`.
     """
 
@@ -369,7 +369,7 @@ def make_sym_mv(name, dim, var_factory=None):
             make_sym_array(name, dim, var_factory))
 
 
-class CFunction(GrudgeStringifiable, pymbolic.primitives.Variable):
+class CFunction(ExpressionBase, pymbolic.primitives.Variable):
     """A symbol representing a C-level function, to be used as the function
     argument of :class:`pymbolic.primitives.Call`.
     """
@@ -396,7 +396,7 @@ bessel_y = CFunction("bessel_y")
 
 # {{{ technical helpers
 
-class OperatorBinding(GrudgeStringifiable, pymbolic.primitives.Expression):
+class OperatorBinding(ExpressionBase):
     init_arg_names = ("op", "field")
 
     def __init__(self, op, field):
@@ -441,16 +441,13 @@ class PrioritizedSubexpression(pymbolic.primitives.CommonSubexpression):
 # }}}
 
 
-class Ones(GrudgeStringifiable, HasDOFDesc, Expression):
-    def __getinitargs__(self):
-        return (self.dd,)
-
+class Ones(HasDOFDesc, ExpressionBase):
     mapper_method = intern("map_ones")
 
 
 # {{{ geometry data
 
-class DiscretizationProperty(GrudgeStringifiable, HasDOFDesc, Expression):
+class DiscretizationProperty(HasDOFDesc, ExpressionBase):
     pass
 
 
-- 
GitLab


From f5c141a2b51a88c12200cfbf2da6e3988307bce5 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 3 Apr 2019 20:04:15 -0500
Subject: [PATCH 08/35] Fix whitespace changes

---
 grudge/symbolic/compiler.py   | 1 +
 grudge/symbolic/primitives.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index ce896235..4a8ee89e 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -1287,6 +1287,7 @@ class OperatorCompiler(mappers.IdentityMapper):
         else:
             # If it's not a C-level function, it shouldn't get muddled up into
             # a vector math expression.
+
             return self.assign_to_new_var(
                     codegen_state,
                     type(expr)(
diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index 571547fd..d35da7c0 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -40,7 +40,6 @@ from pytools.obj_array import join_fields, make_obj_array  # noqa
 
 
 class ExpressionBase(pymbolic.primitives.Expression):
-
     def stringifier(self):
         from grudge.symbolic.mappers import StringifyMapper
         return StringifyMapper
-- 
GitLab


From ef3fe7a06d70355139491a8516f6b006e7f55958 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 4 Apr 2019 00:40:38 -0500
Subject: [PATCH 09/35] Add initial fusion study code.

---
 examples/dagrt_fusion/fusion-study.py | 454 ++++++++++++++++++++++++++
 grudge/symbolic/compiler.py           |  24 +-
 grudge/symbolic/mappers/__init__.py   |  34 ++
 3 files changed, 501 insertions(+), 11 deletions(-)
 create mode 100644 examples/dagrt_fusion/fusion-study.py

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
new file mode 100644
index 00000000..d37dca94
--- /dev/null
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -0,0 +1,454 @@
+from __future__ import division, print_function
+
+__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import logging
+import numpy as np
+import pyopencl as cl
+
+import dagrt.language as lang
+import pymbolic.primitives as p
+import grudge.symbolic.mappers as gmap
+from pymbolic.mapper.evaluator import EvaluationMapper \
+        as PymbolicEvaluationMapper
+
+from grudge import sym, bind, DGDiscretizationWithBoundaries
+from leap.rk import LSRK4Method
+
+
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+
+# {{{ topological sort
+
+def topological_sort(stmts, root_deps):
+    id_to_stmt = {stmt.id: stmt for stmt in stmts}
+
+    ordered_stmts = []
+    satisfied = set()
+
+    def satisfy_dep(name):
+        if name in satisfied:
+            return
+
+        stmt = id_to_stmt[name]
+        for dep in stmt.depends_on:
+            satisfy_dep(dep)
+        ordered_stmts.append(stmt)
+        satisfied.add(name)
+
+    for d in root_deps:
+        satisfy_dep(d)
+
+    return ordered_stmts
+
+# }}}
+
+
+# Use evaluation, not identity mappers to propagate symbolic vectors to
+# outermost level.
+
+class DagrtToGrudgeRewriter(PymbolicEvaluationMapper):
+    def __init__(self, context):
+        self.context = context
+
+    def map_variable(self, expr):
+        return self.context[expr.name]
+
+    def map_call(self, expr):
+        raise ValueError("function call not expected")
+
+
+class GrudgeArgSubstitutor(gmap.SymbolicEvaluator):
+    def __init__(self, args):
+        super().__init__(context={})
+        self.args = args
+
+    def map_grudge_variable(self, expr):
+        if expr.name in self.args:
+            return self.args[expr.name]
+        else:
+            return super().map_variable(expr)
+
+
+def transcribe_phase(dag, field_var_name, field_components, phase_name,
+                     sym_operator):
+    sym_operator = gmap.OperatorBinder()(sym_operator)
+    phase = dag.phases[phase_name]
+
+    ctx = {
+            "<t>": sym.var("input_t", sym.DD_SCALAR),
+            "<dt>": sym.var("input_dt", sym.DD_SCALAR),
+            f"<state>{field_var_name}": sym.make_sym_array(
+                f"input_{field_var_name}", field_components),
+            f"<p>residual": sym.make_sym_array(
+                "input_residual", field_components),
+    }
+
+    rhs_name = f"<func>{field_var_name}"
+    output_vars = [v for v in ctx]
+    yielded_states = []
+
+    from dagrt.codegen.transform import isolate_function_calls_in_phase
+    ordered_stmts = topological_sort(
+            isolate_function_calls_in_phase(
+                phase,
+                dag.get_stmt_id_generator(),
+                dag.get_var_name_generator()).statements,
+            phase.depends_on)
+
+    for stmt in ordered_stmts:
+        if stmt.condition is not True:
+            raise NotImplementedError(
+                "non-True condition (in statement '%s') not supported"
+                % stmt.id)
+
+        if isinstance(stmt, lang.Nop):
+            pass
+
+        elif isinstance(stmt, lang.AssignExpression):
+            if not isinstance(stmt.lhs, p.Variable):
+                raise NotImplementedError("lhs of statement %s is not a variable: %s"
+                        % (stmt.id, stmt.lhs))
+            ctx[stmt.lhs.name] = sym.cse(
+                DagrtToGrudgeRewriter(ctx)(stmt.rhs),
+                (
+                    stmt.lhs.name
+                    .replace("<", "")
+                    .replace(">", "")))
+
+        elif isinstance(stmt, lang.AssignFunctionCall):
+            if stmt.function_id != rhs_name:
+                raise NotImplementedError(
+                        "statement '%s' calls unsupported function '%s'"
+                        % (stmt.id, stmt.function_id))
+
+            if stmt.parameters:
+                raise NotImplementedError(
+                    "statement '%s' calls function '%s' with positional arguments"
+                    % (stmt.id, stmt.function_id))
+
+            kwargs = {name: sym.cse(DagrtToGrudgeRewriter(ctx)(arg))
+                      for name, arg in stmt.kw_parameters.items()}
+
+            if len(stmt.assignees) != 1:
+                raise NotImplementedError(
+                    "statement '%s' calls function '%s' "
+                    "with more than one LHS"
+                    % (stmt.id, stmt.function_id))
+
+            assignee, = stmt.assignees
+            ctx[assignee] = GrudgeArgSubstitutor(kwargs)(sym_operator)
+
+        elif isinstance(stmt, lang.YieldState):
+            d2g = DagrtToGrudgeRewriter(ctx)
+            yielded_states.append(
+                (stmt.time_id, d2g(stmt.time), stmt.component_id,
+                    d2g(stmt.expression)))
+
+        else:
+            raise NotImplementedError("statement %s is of unsupported type ''%s'"
+                        % (stmt.id, type(stmt).__name__))
+
+    return output_vars, [ctx[ov] for ov in output_vars], yielded_states
+
+
+def get_strong_wave_op_with_discr(cl_ctx, dims=3, order=4):
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+    mesh = generate_regular_rect_mesh(
+            a=(-0.5,)*dims,
+            b=(0.5,)*dims,
+            n=(16,)*dims)
+
+    logger.info("%d elements" % mesh.nelements)
+
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+
+    source_center = np.array([0.1, 0.22, 0.33])[:dims]
+    source_width = 0.05
+    source_omega = 3
+
+    sym_x = sym.nodes(mesh.dim)
+    sym_source_center_dist = sym_x - source_center
+    sym_t = sym.ScalarVariable("t")
+
+    from grudge.models.wave import StrongWaveOperator
+    from meshmode.mesh import BTAG_ALL, BTAG_NONE
+    op = StrongWaveOperator(-0.1, dims,
+            source_f=(
+                sym.sin(source_omega*sym_t)
+                * sym.exp(
+                    -np.dot(sym_source_center_dist, sym_source_center_dist)
+                    / source_width**2)),
+            dirichlet_tag=BTAG_NONE,
+            neumann_tag=BTAG_NONE,
+            radiation_tag=BTAG_ALL,
+            flux_type="upwind")
+
+    op.check_bc_coverage(mesh)
+
+    return (op, discr)
+
+
+class RK4TimeStepperBase(object):
+
+    def get_initial_context(self, fields, t_start, dt):
+        from pytools.obj_array import join_fields
+
+        # Flatten fields.
+        flattened_fields = []
+        for field in fields:
+            if isinstance(field, list):
+                flattened_fields.extend(field)
+            else:
+                flattened_fields.append(field)
+        flattened_fields = join_fields(*flattened_fields)
+        del fields
+
+        return {
+                "input_t": t_start,
+                "input_dt": dt,
+                self.state_name: flattened_fields,
+                "input_residual": flattened_fields,
+        }
+
+    def set_up_stepper(self, discr, field_var_name, sym_rhs, num_fields):
+        dt_method = LSRK4Method(component_id=field_var_name)
+        dt_code = dt_method.generate()
+        self.field_var_name = field_var_name
+        self.state_name = f"input_{field_var_name}"
+
+        # Transcribe the phase.
+        output_vars, results, yielded_states = transcribe_phase(
+                dt_code, field_var_name, num_fields,
+                "primary", sym_rhs)
+
+        # Build the bound operator for the time integrator.
+        output_t = results[0]
+        output_dt = results[1]
+        output_states = results[2]
+        output_residuals = results[3]
+
+        assert len(output_states) == num_fields
+        assert len(output_states) == len(output_residuals)
+
+        from pytools.obj_array import join_fields
+        flattened_results = join_fields(output_t, output_dt, *output_states)
+
+        self.bound_op = bind(discr, flattened_results)
+
+
+class RK4TimeStepper(RK4TimeStepperBase):
+
+    def __init__(self, queue, discr, field_var_name, grudge_bound_op,
+                 num_fields, component_getter):
+        from pymbolic import var
+
+        # Construct sym_rhs to have the effect of replacing the RHS calls in the
+        # dagrt code with calls of the grudge operator.
+        from grudge.symbolic.primitives import ExternalCall, Variable
+        call = sym.cse(ExternalCall(
+                var("grudge_op"),
+                (
+                    (Variable("t", dd=sym.DD_SCALAR),)
+                    + tuple(
+                        Variable(field_var_name, dd=sym.DD_VOLUME)[i]
+                        for i in range(num_fields))),
+                dd=sym.DD_VOLUME))
+
+        from pytools.obj_array import join_fields
+        sym_rhs = join_fields(*(call[i] for i in range(num_fields)))
+
+        self.queue = queue
+        self.grudge_bound_op = grudge_bound_op
+        self.set_up_stepper(discr, field_var_name, sym_rhs, num_fields)
+        self.component_getter = component_getter
+
+    def _bound_op(self, t, *args):
+        from pytools.obj_array import join_fields
+        context = {
+                "t": t,
+                self.field_var_name: join_fields(*args)}
+        return self.grudge_bound_op(self.queue, **context)
+
+    def get_initial_context(self, fields, t_start, dt):
+        context = super().get_initial_context(fields, t_start, dt)
+        context["grudge_op"] = self._bound_op
+        return context
+
+    def run(self, fields, t_start, dt, t_end):
+        context = self.get_initial_context(fields, t_start, dt)
+
+        t = t_start
+
+        while t <= t_end:
+            results = self.bound_op(self.queue, **context)
+            t = results[0]
+            context["input_t"] = t
+            context["input_dt"] = results[1]
+            output_states = results[2:]
+            context[self.state_name] = output_states
+            yield (t, self.component_getter(output_states))
+
+
+class FusedRK4TimeStepper(RK4TimeStepperBase):
+
+    def __init__(self, queue, discr, field_var_name, sym_rhs, num_fields,
+                 component_getter):
+        self.queue = queue
+        self.set_up_stepper(discr, field_var_name, sym_rhs, num_fields)
+        self.component_getter = component_getter
+
+    def run(self, fields, t_start, dt, t_end):
+        context = self.get_initial_context(fields, t_start, dt)
+
+        t = t_start
+
+        while t <= t_end:
+            results = self.bound_op(self.queue, **context)
+            t = results[0]
+            context["input_t"] = t
+            context["input_dt"] = results[1]
+            output_states = results[2:]
+            context[self.state_name] = output_states
+            yield (t, self.component_getter(output_states))
+
+
+class FusedGrudgeRK4TimeStepper(object):
+
+    def __init__(self, queue, field_var_name, dt, fields, sym_rhs, discr,
+                 component_getter, t_start=0):
+        self.t_start = t_start
+        self.dt = dt
+        dt_method = LSRK4Method(component_id=field_var_name)
+        dt_code = dt_method.generate()
+
+        from pytools.obj_array import join_fields
+
+        # Flatten fields.
+        flattened_fields = []
+        for field in fields:
+            if isinstance(field, list):
+                flattened_fields.extend(field)
+            else:
+                flattened_fields.append(field)
+        flattened_fields = join_fields(*flattened_fields)
+        del fields
+
+        output_vars, results, yielded_states = transcribe_phase(
+                dt_code, field_var_name, len(flattened_fields),
+                "primary", sym_rhs)
+
+        output_t = results[0]
+        output_dt = results[1]
+        output_states = results[2]
+        output_residuals = results[3]
+
+        assert len(output_states) == len(flattened_fields)
+        assert len(output_states) == len(output_residuals)
+
+        flattened_results = join_fields(output_t, output_dt, *output_states)
+        self.bound_op = bind(discr, flattened_results)
+        self.queue = queue
+
+        self.initial_context = {
+                "input_t": t_start,
+                "input_dt": dt,
+                self.state_name: flattened_fields,
+                "input_residual": flattened_fields,
+        }
+
+        self.component_getter = component_getter
+
+    def run(self, t_end):
+        t = self.t_start
+        context = self.initial_context.copy()
+
+        while t <= t_end:
+            results = self.bound_op(self.queue, **context)
+            t = results[0]
+            context["input_t"] = t
+            context["input_dt"] = results[1]
+            output_states = results[2:]
+            context[self.state_name] = output_states
+            yield (t, self.component_getter(output_states))
+
+
+def get_strong_wave_component(state_component):
+    return (state_component[0], state_component[1:])
+
+
+# {{{ equivalence check
+
+def test_stepper_equivalence(order=4):
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    dims = 2
+
+    op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=dims, order=order)
+
+    if dims == 2:
+        dt = 0.04
+    elif dims == 3:
+        dt = 0.02
+
+    from pytools.obj_array import join_fields
+    ic = join_fields(discr.zeros(queue),
+            [discr.zeros(queue) for i in range(discr.dim)])
+
+    bound_op = bind(discr, op.sym_operator())
+
+    stepper = RK4TimeStepper(
+            queue, discr, "w", bound_op, 1 + discr.dim, get_strong_wave_component)
+
+    fused_stepper = FusedRK4TimeStepper(
+            queue, discr, "w", op.sym_operator(), 1 + discr.dim,
+            get_strong_wave_component)
+
+    t_start = 0
+    t_end = 0.5
+    nsteps = int(t_end/dt)
+    print("dt=%g nsteps=%d" % (dt, nsteps))
+
+    step = 0
+
+    norm = bind(discr, sym.norm(2, sym.var("u_ref") - sym.var("u")))
+
+    fused_steps = fused_stepper.run(ic, t_start, dt, t_end)
+
+    for t_ref, (u_ref, v_ref) in stepper.run(ic, t_start, dt, t_end):
+        step += 1
+        logger.info("step %d", step)
+        t, (u, v) = next(fused_steps)
+        assert t == t_ref, step
+        assert norm(queue, u=u, u_ref=u_ref) <= 1e-13, step
+
+# }}}
+
+
+if __name__ == "__main__":
+    test_stepper_equivalence()
diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index 4a8ee89e..39313043 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -106,10 +106,13 @@ class LoopyKernelInstruction(Instruction):
     @memoize_method
     def get_dependencies(self):
         from pymbolic.primitives import Variable, Subscript
-        return set(
-                v
-                for v in self.kernel_descriptor.input_mappings.values()
-                if isinstance(v, (Variable, Subscript)))
+        result = set()
+        for v in self.kernel_descriptor.input_mappings.values():
+            if isinstance(v, Variable):
+                result.add(v)
+            elif isinstance(v, Subscript):
+                result.add(v.aggregate)
+        return result
 
     def __str__(self):
         knl_str = "\n".join(
@@ -580,10 +583,6 @@ class Code(object):
                             break
 
         if len(done_insns) < len(self.instructions):
-            print("Unreachable instructions:")
-            for insn in set(self.instructions) - done_insns:
-                print("    ", insn)
-
             raise RuntimeError("not all instructions are reachable"
                     "--did you forget to pass a value for a placeholder?")
 
@@ -1152,11 +1151,14 @@ class OperatorCompiler(mappers.IdentityMapper):
         from grudge.symbolic.dofdesc_inference import DOFDescInferenceMapper
         inf_mapper = DOFDescInferenceMapper(discr_code + eval_code)
 
-        eval_code = aggregate_assignments(
+        if 1:
+            eval_code = aggregate_assignments(
                 inf_mapper, eval_code, result, self.max_vectors_in_batch_expr)
 
-        discr_code = rewrite_insn_to_loopy_insns(inf_mapper, discr_code)
-        eval_code = rewrite_insn_to_loopy_insns(inf_mapper, eval_code)
+            discr_code = rewrite_insn_to_loopy_insns(inf_mapper, discr_code)
+            eval_code = rewrite_insn_to_loopy_insns(inf_mapper, eval_code)
+
+        code = Code(eval_code, result)
 
         from pytools.obj_array import make_obj_array
         return (
diff --git a/grudge/symbolic/mappers/__init__.py b/grudge/symbolic/mappers/__init__.py
index d1b70475..05205d27 100644
--- a/grudge/symbolic/mappers/__init__.py
+++ b/grudge/symbolic/mappers/__init__.py
@@ -1300,6 +1300,40 @@ class FluxExchangeCollector(CSECachingMapperMixin, CollectorMixin, CombineMapper
 class Evaluator(pymbolic.mapper.evaluator.EvaluationMapper):
     pass
 
+
+class SymbolicEvaluator(pymbolic.mapper.evaluator.EvaluationMapper):
+    def map_operator_binding(self, expr, *args, **kwargs):
+        return expr.op(self.rec(expr.field, *args, **kwargs))
+
+    def map_node_coordinate_component(self, expr, *args, **kwargs):
+        return expr
+
+    def map_call(self, expr, *args, **kwargs):
+        return type(expr)(
+                expr.function,
+                tuple(self.rec(child, *args, **kwargs)
+                    for child in expr.parameters))
+
+    def map_call_with_kwargs(self, expr, *args, **kwargs):
+        return type(expr)(
+                expr.function,
+                tuple(self.rec(child, *args, **kwargs)
+                    for child in expr.parameters),
+                dict(
+                    (key, self.rec(val, *args, **kwargs))
+                    for key, val in six.iteritems(expr.kw_parameters))
+                    )
+
+    def map_external_call(self, expr, *args, **kwargs):
+        return type(expr)(
+                expr.function,
+                tuple(self.rec(child, *args, **kwargs)
+                    for child in expr.parameters),
+                expr.dd)
+
+    def map_common_subexpression(self, expr):
+        return type(expr)(self.rec(expr.child), expr.prefix, expr.scope)
+
 # }}}
 
 
-- 
GitLab


From 326ea9b52be988811021fe59542c4a17e7bf795f Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 4 Apr 2019 00:47:21 -0500
Subject: [PATCH 10/35] Kill dead code

---
 examples/dagrt_fusion/fusion-study.py | 63 +--------------------------
 1 file changed, 1 insertion(+), 62 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index d37dca94..ce12f169 100644
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -90,8 +90,7 @@ class GrudgeArgSubstitutor(gmap.SymbolicEvaluator):
     def map_grudge_variable(self, expr):
         if expr.name in self.args:
             return self.args[expr.name]
-        else:
-            return super().map_variable(expr)
+        return super().map_variable(expr)
 
 
 def transcribe_phase(dag, field_var_name, field_components, phase_name,
@@ -337,66 +336,6 @@ class FusedRK4TimeStepper(RK4TimeStepperBase):
             yield (t, self.component_getter(output_states))
 
 
-class FusedGrudgeRK4TimeStepper(object):
-
-    def __init__(self, queue, field_var_name, dt, fields, sym_rhs, discr,
-                 component_getter, t_start=0):
-        self.t_start = t_start
-        self.dt = dt
-        dt_method = LSRK4Method(component_id=field_var_name)
-        dt_code = dt_method.generate()
-
-        from pytools.obj_array import join_fields
-
-        # Flatten fields.
-        flattened_fields = []
-        for field in fields:
-            if isinstance(field, list):
-                flattened_fields.extend(field)
-            else:
-                flattened_fields.append(field)
-        flattened_fields = join_fields(*flattened_fields)
-        del fields
-
-        output_vars, results, yielded_states = transcribe_phase(
-                dt_code, field_var_name, len(flattened_fields),
-                "primary", sym_rhs)
-
-        output_t = results[0]
-        output_dt = results[1]
-        output_states = results[2]
-        output_residuals = results[3]
-
-        assert len(output_states) == len(flattened_fields)
-        assert len(output_states) == len(output_residuals)
-
-        flattened_results = join_fields(output_t, output_dt, *output_states)
-        self.bound_op = bind(discr, flattened_results)
-        self.queue = queue
-
-        self.initial_context = {
-                "input_t": t_start,
-                "input_dt": dt,
-                self.state_name: flattened_fields,
-                "input_residual": flattened_fields,
-        }
-
-        self.component_getter = component_getter
-
-    def run(self, t_end):
-        t = self.t_start
-        context = self.initial_context.copy()
-
-        while t <= t_end:
-            results = self.bound_op(self.queue, **context)
-            t = results[0]
-            context["input_t"] = t
-            context["input_dt"] = results[1]
-            output_states = results[2:]
-            context[self.state_name] = output_states
-            yield (t, self.component_getter(output_states))
-
-
 def get_strong_wave_component(state_component):
     return (state_component[0], state_component[1:])
 
-- 
GitLab


From 281a5a85b3aeade1a2d5de069810b178a14af1c8 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 4 Apr 2019 18:35:43 -0500
Subject: [PATCH 11/35] Get it working, sort of

---
 examples/dagrt_fusion/fusion-study.py | 290 +++++++++++++++++++-------
 1 file changed, 219 insertions(+), 71 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index ce12f169..4c451ef6 100644
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -25,11 +25,14 @@ THE SOFTWARE.
 
 import logging
 import numpy as np
+import six
 import pyopencl as cl
+import pyopencl.array  # noqa
 
 import dagrt.language as lang
 import pymbolic.primitives as p
 import grudge.symbolic.mappers as gmap
+from grudge.execution import ExecutionMapper
 from pymbolic.mapper.evaluator import EvaluationMapper \
         as PymbolicEvaluationMapper
 
@@ -68,6 +71,8 @@ def topological_sort(stmts, root_deps):
 # }}}
 
 
+# {{{ leap to grudge translation
+
 # Use evaluation, not identity mappers to propagate symbolic vectors to
 # outermost level.
 
@@ -174,43 +179,10 @@ def transcribe_phase(dag, field_var_name, field_components, phase_name,
 
     return output_vars, [ctx[ov] for ov in output_vars], yielded_states
 
+# }}}
 
-def get_strong_wave_op_with_discr(cl_ctx, dims=3, order=4):
-    from meshmode.mesh.generation import generate_regular_rect_mesh
-    mesh = generate_regular_rect_mesh(
-            a=(-0.5,)*dims,
-            b=(0.5,)*dims,
-            n=(16,)*dims)
-
-    logger.info("%d elements" % mesh.nelements)
-
-    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
-
-    source_center = np.array([0.1, 0.22, 0.33])[:dims]
-    source_width = 0.05
-    source_omega = 3
-
-    sym_x = sym.nodes(mesh.dim)
-    sym_source_center_dist = sym_x - source_center
-    sym_t = sym.ScalarVariable("t")
-
-    from grudge.models.wave import StrongWaveOperator
-    from meshmode.mesh import BTAG_ALL, BTAG_NONE
-    op = StrongWaveOperator(-0.1, dims,
-            source_f=(
-                sym.sin(source_omega*sym_t)
-                * sym.exp(
-                    -np.dot(sym_source_center_dist, sym_source_center_dist)
-                    / source_width**2)),
-            dirichlet_tag=BTAG_NONE,
-            neumann_tag=BTAG_NONE,
-            radiation_tag=BTAG_ALL,
-            flux_type="upwind")
-
-    op.check_bc_coverage(mesh)
-
-    return (op, discr)
 
+# {{{ time integrator implementations
 
 class RK4TimeStepperBase(object):
 
@@ -234,7 +206,8 @@ class RK4TimeStepperBase(object):
                 "input_residual": flattened_fields,
         }
 
-    def set_up_stepper(self, discr, field_var_name, sym_rhs, num_fields):
+    def set_up_stepper(self, discr, field_var_name, sym_rhs, num_fields,
+                       exec_mapper_factory=ExecutionMapper):
         dt_method = LSRK4Method(component_id=field_var_name)
         dt_code = dt_method.generate()
         self.field_var_name = field_var_name
@@ -257,13 +230,45 @@ class RK4TimeStepperBase(object):
         from pytools.obj_array import join_fields
         flattened_results = join_fields(output_t, output_dt, *output_states)
 
-        self.bound_op = bind(discr, flattened_results)
+        self.bound_op = bind(
+                discr, flattened_results, exec_mapper_factory=exec_mapper_factory)
+
+    def run(self, fields, t_start, dt, t_end, return_profile_data=False):
+        context = self.get_initial_context(fields, t_start, dt)
+
+        t = t_start
+
+        while t <= t_end:
+            if return_profile_data:
+                profile_data = dict()
+            else:
+                profile_data = None
+
+            results = self.bound_op(
+                    self.queue,
+                    profile_data=profile_data,
+                    **context)
+
+            if return_profile_data:
+                results = results[0]
+
+            t = results[0]
+            context["input_t"] = t
+            context["input_dt"] = results[1]
+            output_states = results[2:]
+            context[self.state_name] = output_states
+
+            result = (t, self.component_getter(output_states))
+            if return_profile_data:
+                result += (profile_data,)
+
+            yield result
 
 
 class RK4TimeStepper(RK4TimeStepperBase):
 
     def __init__(self, queue, discr, field_var_name, grudge_bound_op,
-                 num_fields, component_getter):
+                 num_fields, component_getter, exec_mapper_factory=ExecutionMapper):
         from pymbolic import var
 
         # Construct sym_rhs to have the effect of replacing the RHS calls in the
@@ -283,62 +288,82 @@ class RK4TimeStepper(RK4TimeStepperBase):
 
         self.queue = queue
         self.grudge_bound_op = grudge_bound_op
-        self.set_up_stepper(discr, field_var_name, sym_rhs, num_fields)
+        self.set_up_stepper(discr, field_var_name, sym_rhs, num_fields, exec_mapper_factory)
         self.component_getter = component_getter
 
-    def _bound_op(self, t, *args):
+    def _bound_op(self, t, *args, profile_data=None):
         from pytools.obj_array import join_fields
         context = {
                 "t": t,
                 self.field_var_name: join_fields(*args)}
-        return self.grudge_bound_op(self.queue, **context)
+        result = self.grudge_bound_op(
+                self.queue, profile_data=profile_data, **context)
+        if profile_data is not None:
+            result = result[0]
+        return result
 
     def get_initial_context(self, fields, t_start, dt):
         context = super().get_initial_context(fields, t_start, dt)
         context["grudge_op"] = self._bound_op
         return context
 
-    def run(self, fields, t_start, dt, t_end):
-        context = self.get_initial_context(fields, t_start, dt)
-
-        t = t_start
-
-        while t <= t_end:
-            results = self.bound_op(self.queue, **context)
-            t = results[0]
-            context["input_t"] = t
-            context["input_dt"] = results[1]
-            output_states = results[2:]
-            context[self.state_name] = output_states
-            yield (t, self.component_getter(output_states))
-
 
 class FusedRK4TimeStepper(RK4TimeStepperBase):
 
     def __init__(self, queue, discr, field_var_name, sym_rhs, num_fields,
-                 component_getter):
+                 component_getter, exec_mapper_factory=ExecutionMapper):
         self.queue = queue
-        self.set_up_stepper(discr, field_var_name, sym_rhs, num_fields)
+        self.set_up_stepper(
+                discr, field_var_name, sym_rhs, num_fields, exec_mapper_factory)
         self.component_getter = component_getter
 
-    def run(self, fields, t_start, dt, t_end):
-        context = self.get_initial_context(fields, t_start, dt)
+# }}}
 
-        t = t_start
 
-        while t <= t_end:
-            results = self.bound_op(self.queue, **context)
-            t = results[0]
-            context["input_t"] = t
-            context["input_dt"] = results[1]
-            output_states = results[2:]
-            context[self.state_name] = output_states
-            yield (t, self.component_getter(output_states))
+# {{{ problem setup code
+
+def get_strong_wave_op_with_discr(cl_ctx, dims=3, order=4):
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+    mesh = generate_regular_rect_mesh(
+            a=(-0.5,)*dims,
+            b=(0.5,)*dims,
+            n=(16,)*dims)
+
+    logger.info("%d elements" % mesh.nelements)
+
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+
+    source_center = np.array([0.1, 0.22, 0.33])[:dims]
+    source_width = 0.05
+    source_omega = 3
+
+    sym_x = sym.nodes(mesh.dim)
+    sym_source_center_dist = sym_x - source_center
+    sym_t = sym.ScalarVariable("t")
+
+    from grudge.models.wave import StrongWaveOperator
+    from meshmode.mesh import BTAG_ALL, BTAG_NONE
+    op = StrongWaveOperator(-0.1, dims,
+            source_f=(
+                sym.sin(source_omega*sym_t)
+                * sym.exp(
+                    -np.dot(sym_source_center_dist, sym_source_center_dist)
+                    / source_width**2)),
+            dirichlet_tag=BTAG_NONE,
+            neumann_tag=BTAG_NONE,
+            radiation_tag=BTAG_ALL,
+            flux_type="upwind")
+
+    op.check_bc_coverage(mesh)
+
+    return (op, discr)
 
 
 def get_strong_wave_component(state_component):
     return (state_component[0], state_component[1:])
 
+# }}}
+
 
 # {{{ equivalence check
 
@@ -370,7 +395,7 @@ def test_stepper_equivalence(order=4):
 
     t_start = 0
     t_end = 0.5
-    nsteps = int(t_end/dt)
+    nsteps = int(np.ceil((t_end + 1e-9) / dt))
     print("dt=%g nsteps=%d" % (dt, nsteps))
 
     step = 0
@@ -381,7 +406,7 @@ def test_stepper_equivalence(order=4):
 
     for t_ref, (u_ref, v_ref) in stepper.run(ic, t_start, dt, t_end):
         step += 1
-        logger.info("step %d", step)
+        logger.info("step %d/%d", step, nsteps)
         t, (u, v) = next(fused_steps)
         assert t == t_ref, step
         assert norm(queue, u=u, u_ref=u_ref) <= 1e-13, step
@@ -389,5 +414,128 @@ def test_stepper_equivalence(order=4):
 # }}}
 
 
+# {{{ mem op counter implementation
+
+class MemOpCountingExecutionMapper(ExecutionMapper):
+
+    def __init__(self, queue, context, bound_op):
+        super().__init__(queue, context, bound_op)
+
+    # {{{ expression mappings
+
+    def map_common_subexpression(self, expr):
+        raise ValueError("CSE not expected")
+
+    def map_profiled_external_call(self, expr, profile_data):
+        from pymbolic.primitives import Variable
+        assert isinstance(expr.function, Variable)
+        args = [self.rec(p) for p in expr.parameters]
+        return self.context[expr.function.name](*args, profile_data=profile_data)
+
+    # }}}
+        
+    # {{{ instruction mappings
+    
+    def map_insn_assign(self, insn, profile_data):
+        result = []
+        for name, expr in zip(insn.names, insn.exprs):
+            if isinstance(expr, sym.ExternalCall):
+                assert expr.mapper_method == "map_external_call"
+                val = self.map_profiled_external_call(expr, profile_data)
+            else:
+                val = self.rec(expr)
+            result.append((name, val))
+        return result, []
+
+    def map_insn_loopy_kernel(self, insn, profile_data):
+        kwargs = {}
+        kdescr = insn.kernel_descriptor
+        for name, expr in six.iteritems(kdescr.input_mappings):
+            val = self.rec(expr)
+            kwargs[name] = val
+            assert not isinstance(val, np.ndarray)
+            if profile_data is not None and isinstance(val, pyopencl.array.Array):
+                profile_data["bytes_read"] = (
+                        profile_data.get("bytes_read", 0) + val.nbytes)
+
+        discr = self.discrwb.discr_from_dd(kdescr.governing_dd)
+        for name in kdescr.scalar_args():
+            v = kwargs[name]
+            if isinstance(v, (int, float)):
+                kwargs[name] = discr.real_dtype.type(v)
+            elif isinstance(v, complex):
+                kwargs[name] = discr.complex_dtype.type(v)
+            elif isinstance(v, np.number):
+                pass
+            else:
+                raise ValueError("unrecognized scalar type for variable '%s': %s"
+                        % (name, type(v)))
+
+        kwargs["grdg_n"] = discr.nnodes
+        evt, result_dict = kdescr.loopy_kernel(self.queue, **kwargs)
+
+        for val in result_dict.values():
+            assert not isinstance(val, np.ndarray)
+            if profile_data is not None and isinstance(val, pyopencl.array.Array):
+                profile_data["bytes_written"] = (
+                        profile_data.get("bytes_written", 0) + val.nbytes)
+
+        return list(result_dict.items()), []
+
+    # }}}
+
+# }}}
+
+
+# {{{ mem op counter check
+
+def test_stepper_mem_ops():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    dims = 2
+    op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=2, order=3)
+
+    t_start = 0
+    dt = 0.04
+    t_end = 0.2
+
+    from pytools.obj_array import join_fields
+    ic = join_fields(discr.zeros(queue),
+            [discr.zeros(queue) for i in range(discr.dim)])
+
+    bound_op = bind(
+            discr, op.sym_operator(),
+            exec_mapper_factory=MemOpCountingExecutionMapper)
+
+    if 1:
+        stepper = RK4TimeStepper(
+                queue, discr, "w", bound_op, 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=MemOpCountingExecutionMapper)
+
+    else:
+        stepper = FusedRK4TimeStepper(
+                queue, discr, "w", op.sym_operator(), 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=MemOpCountingExecutionMapper)
+
+    step = 0
+
+    norm = bind(discr, sym.norm(2, sym.var("u_ref") - sym.var("u")))
+
+    nsteps = int(np.ceil((t_end + 1e-9) / dt))
+    for (_, _, profile_data) in stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        step += 1
+        logger.info("step %d/%d", step, nsteps)
+
+    print("bytes read", profile_data["bytes_read"])
+    print("bytes written", profile_data["bytes_written"])
+
+# }}}
+
+
 if __name__ == "__main__":
-    test_stepper_equivalence()
+    #test_stepper_equivalence()
+    test_stepper_mem_ops()
-- 
GitLab


From d3d6043256dc5b0e7178dfffaaf86f84d7c9d3c4 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 4 Apr 2019 18:36:28 -0500
Subject: [PATCH 12/35] Add profiling to instructions in ExecutionMapper, and
 also add an exec_mapper_factory argument to bind()

---
 grudge/execution.py         | 27 +++++++++++++++------------
 grudge/symbolic/compiler.py |  2 +-
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/grudge/execution.py b/grudge/execution.py
index 9084b823..d215e0bb 100644
--- a/grudge/execution.py
+++ b/grudge/execution.py
@@ -332,7 +332,7 @@ class ExecutionMapper(mappers.Evaluator,
 
     # {{{ instruction execution functions
 
-    def map_insn_rank_data_swap(self, insn):
+    def map_insn_rank_data_swap(self, insn, profile_data=None):
         local_data = self.rec(insn.field).get(self.queue)
         comm = self.discrwb.mpi_communicator
 
@@ -347,7 +347,7 @@ class ExecutionMapper(mappers.Evaluator,
                 MPIRecvFuture(recv_req, insn.name, remote_data_host, self.queue),
                 MPISendFuture(send_req)]
 
-    def map_insn_loopy_kernel(self, insn):
+    def map_insn_loopy_kernel(self, insn, profile_data=None):
         kwargs = {}
         kdescr = insn.kernel_descriptor
         for name, expr in six.iteritems(kdescr.input_mappings):
@@ -370,11 +370,11 @@ class ExecutionMapper(mappers.Evaluator,
         evt, result_dict = kdescr.loopy_kernel(self.queue, **kwargs)
         return list(result_dict.items()), []
 
-    def map_insn_assign(self, insn):
+    def map_insn_assign(self, insn, profile_data=None):
         return [(name, self.rec(expr))
                 for name, expr in zip(insn.names, insn.exprs)], []
 
-    def map_insn_assign_to_discr_scoped(self, insn):
+    def map_insn_assign_to_discr_scoped(self, insn, profile_data=None):
         assignments = []
         for name, expr in zip(insn.names, insn.exprs):
             value = self.rec(expr)
@@ -383,11 +383,11 @@ class ExecutionMapper(mappers.Evaluator,
 
         return assignments, []
 
-    def map_insn_assign_from_discr_scoped(self, insn):
+    def map_insn_assign_from_discr_scoped(self, insn, profile_data=None):
         return [(insn.name,
             self.discrwb._discr_scoped_subexpr_name_to_value[insn.name])], []
 
-    def map_insn_diff_batch_assign(self, insn):
+    def map_insn_diff_batch_assign(self, insn, profile_data=None):
         field = self.rec(insn.field)
         repr_op = insn.operators[0]
         # FIXME: There's no real reason why differentiation is special,
@@ -483,13 +483,15 @@ class MPISendFuture(object):
 # {{{ bound operator
 
 class BoundOperator(object):
-    def __init__(self, discrwb, discr_code, eval_code, debug_flags, allocator=None):
+    def __init__(self, discrwb, discr_code, eval_code, debug_flags,
+                 allocator=None, exec_mapper_factory=ExecutionMapper):
         self.discrwb = discrwb
         self.discr_code = discr_code
         self.eval_code = eval_code
         self.operator_data_cache = {}
         self.debug_flags = debug_flags
         self.allocator = allocator
+        self.exec_mapper_factory = exec_mapper_factory
 
     def __str__(self):
         sep = 75 * "=" + "\n"
@@ -523,7 +525,7 @@ class BoundOperator(object):
             # need to do discrwb-scope evaluation
             discrwb_eval_context = {}
             self.discr_code.execute(
-                    ExecutionMapper(queue, discrwb_eval_context, self))
+                    self.exec_mapper_factory(queue, discrwb_eval_context, self))
 
         # }}}
 
@@ -532,7 +534,7 @@ class BoundOperator(object):
             new_context[name] = with_object_array_or_scalar(replace_queue, var)
 
         return self.eval_code.execute(
-                ExecutionMapper(queue, new_context, self),
+                self.exec_mapper_factory(queue, new_context, self),
                 profile_data=profile_data,
                 log_quantities=log_quantities)
 
@@ -634,8 +636,8 @@ def process_sym_operator(discrwb, sym_operator, post_bind_mapper=None,
 # }}}
 
 
-def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
-        debug_flags=set(), allocator=None):
+def bind(discr, sym_operator, post_bind_mapper=lambda x: x, debug_flags=set(),
+         allocator=None, exec_mapper_factory=ExecutionMapper):
     # from grudge.symbolic.mappers import QuadratureUpsamplerRemover
     # sym_operator = QuadratureUpsamplerRemover(self.quad_min_degrees)(
     #         sym_operator)
@@ -661,7 +663,8 @@ def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
     discr_code, eval_code = OperatorCompiler(discr)(sym_operator)
 
     bound_op = BoundOperator(discr, discr_code, eval_code,
-            debug_flags=debug_flags, allocator=allocator)
+            debug_flags=debug_flags, allocator=allocator,
+            exec_mapper_factory=exec_mapper_factory)
 
     if "dump_op_code" in debug_flags:
         from grudge.tools import open_unique_debug_file
diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index 39313043..3d9379bd 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -526,7 +526,7 @@ class Code(object):
                                 log_quantities["rank_data_swap_timer"],
                                 log_quantities["rank_data_swap_counter"])
 
-                assignments, new_futures = mapper_method(insn)
+                assignments, new_futures = mapper_method(insn, profile_data)
 
                 for target, value in assignments:
                     if pre_assign_check is not None:
-- 
GitLab


From 9a02f5d4e2c65b0ce8e1ac0b08983c5b16ca8617 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 6 Apr 2019 18:22:31 -0500
Subject: [PATCH 13/35] Code tidying, documentation

---
 examples/dagrt_fusion/fusion-study.py | 158 ++++++++++++++++++++++----
 1 file changed, 137 insertions(+), 21 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index 4c451ef6..ab33eb91 100644
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -1,6 +1,13 @@
+#!/usr/bin/env python3
+"""A study of operator fusion for time integration operators in Grudge.
+"""
+
 from __future__ import division, print_function
 
-__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
+__copyright__ = """
+Copyright (C) 2015 Andreas Kloeckner
+Copyright (C) 2019 Matt Wala
+"""
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -28,13 +35,17 @@ import numpy as np
 import six
 import pyopencl as cl
 import pyopencl.array  # noqa
+import pytest
 
 import dagrt.language as lang
+import loopy as lp
 import pymbolic.primitives as p
 import grudge.symbolic.mappers as gmap
+import grudge.symbolic.operators as op
 from grudge.execution import ExecutionMapper
 from pymbolic.mapper.evaluator import EvaluationMapper \
         as PymbolicEvaluationMapper
+from pytools import memoize_in
 
 from grudge import sym, bind, DGDiscretizationWithBoundaries
 from leap.rk import LSRK4Method
@@ -44,6 +55,8 @@ logging.basicConfig(level=logging.INFO)
 
 logger = logging.getLogger(__name__)
 
+logger.setLevel(logging.DEBUG)
+
 
 # {{{ topological sort
 
@@ -100,6 +113,19 @@ class GrudgeArgSubstitutor(gmap.SymbolicEvaluator):
 
 def transcribe_phase(dag, field_var_name, field_components, phase_name,
                      sym_operator):
+    """Arguments:
+
+        dag: The Dagrt code for the time integrator
+
+        field_var_name: The name of the simulation variable
+
+        field_components: The number of components (fields) in the variable
+
+        phase_name: The name of the phase to transcribe in the Dagrt code
+
+        sym_operator: The Grudge symbolic expression to substitue for the
+            right-hand side evaluation in the Dagrt code
+    """
     sym_operator = gmap.OperatorBinder()(sym_operator)
     phase = dag.phases[phase_name]
 
@@ -269,6 +295,19 @@ class RK4TimeStepper(RK4TimeStepperBase):
 
     def __init__(self, queue, discr, field_var_name, grudge_bound_op,
                  num_fields, component_getter, exec_mapper_factory=ExecutionMapper):
+        """Arguments:
+
+            field_var_name: The name of the simulation variable
+
+            grudge_bound_op: The BoundExpression for the right-hand side
+
+            num_fields: The number of components in the simulation variable
+
+            component_getter: A function, which, given an object array
+               representing the simulation variable, splits the array into
+               its components
+
+        """
         from pymbolic import var
 
         # Construct sym_rhs to have the effect of replacing the RHS calls in the
@@ -322,14 +361,14 @@ class FusedRK4TimeStepper(RK4TimeStepperBase):
 
 # {{{ problem setup code
 
-def get_strong_wave_op_with_discr(cl_ctx, dims=3, order=4):
+def get_strong_wave_op_with_discr(cl_ctx, dims=2, order=4):
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dims,
             b=(0.5,)*dims,
             n=(16,)*dims)
 
-    logger.info("%d elements" % mesh.nelements)
+    logger.debug("%d elements" % mesh.nelements)
 
     discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
 
@@ -365,7 +404,7 @@ def get_strong_wave_component(state_component):
 # }}}
 
 
-# {{{ equivalence check
+# {{{ equivalence check between fused and non-fused versions
 
 def test_stepper_equivalence(order=4):
     cl_ctx = cl.create_some_context()
@@ -406,7 +445,7 @@ def test_stepper_equivalence(order=4):
 
     for t_ref, (u_ref, v_ref) in stepper.run(ic, t_start, dt, t_end):
         step += 1
-        logger.info("step %d/%d", step, nsteps)
+        logger.debug("step %d/%d", step, nsteps)
         t, (u, v) = next(fused_steps)
         assert t == t_ref, step
         assert norm(queue, u=u, u_ref=u_ref) <= 1e-13, step
@@ -417,14 +456,17 @@ def test_stepper_equivalence(order=4):
 # {{{ mem op counter implementation
 
 class MemOpCountingExecutionMapper(ExecutionMapper):
+    # This is a skeleton implementation that only has just enough functionality
+    # for the wave-min example to work.
 
     def __init__(self, queue, context, bound_op):
         super().__init__(queue, context, bound_op)
 
-    # {{{ expression mappings
+    def map_external_call(self, expr):
+        # Should have been caught by our op counter
+        assert False, ("map_external_call called: %s" % expr)
 
-    def map_common_subexpression(self, expr):
-        raise ValueError("CSE not expected")
+    # {{{ expressions
 
     def map_profiled_external_call(self, expr, profile_data):
         from pymbolic.primitives import Variable
@@ -432,19 +474,56 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
         args = [self.rec(p) for p in expr.parameters]
         return self.context[expr.function.name](*args, profile_data=profile_data)
 
+    def map_profiled_essentially_elementwise_linear(self, op, field_expr, profile_data):
+        result = getattr(self, op.mapper_method)(op, field_expr)
+
+        if profile_data is not None:
+            # We model the cost to load the input and write the output.  In
+            # particular, we assume the elementwise matrices are negligible in
+            # size and thus ignorable.
+
+            field = self.rec(field_expr)
+            profile_data["bytes_read"] = (
+                    profile_data.get("bytes_read", 0) + field.nbytes)
+            profile_data["bytes_written"] = (
+                    profile_data.get("bytes_written", 0) + result.nbytes)
+
+        return result
+
     # }}}
-        
+
     # {{{ instruction mappings
-    
+
+    def process_assignment_expr(self, expr, profile_data):
+        if isinstance(expr, sym.ExternalCall):
+            assert expr.mapper_method == "map_external_call"
+            val = self.map_profiled_external_call(expr, profile_data)
+
+        elif isinstance(expr, sym.OperatorBinding):
+            if isinstance(
+                    expr.op,
+                    (
+                        # TODO: Not comprehensive.
+                        op.InterpolationOperator,
+                        op.RefFaceMassOperator,
+                        op.RefInverseMassOperator,
+                        op.OppositeInteriorFaceSwap)):
+                val = self.map_profiled_essentially_elementwise_linear(
+                        expr.op, expr.field, profile_data)
+
+            else:
+                assert False, ("unknown operator: %s" % expr.op)
+
+        else:
+            logger.debug("assignment not profiled: %s", expr)
+            val = self.rec(expr)
+
+        return val
+
     def map_insn_assign(self, insn, profile_data):
         result = []
         for name, expr in zip(insn.names, insn.exprs):
-            if isinstance(expr, sym.ExternalCall):
-                assert expr.mapper_method == "map_external_call"
-                val = self.map_profiled_external_call(expr, profile_data)
-            else:
-                val = self.rec(expr)
-            result.append((name, val))
+            result.append((name, self.process_assignment_expr(expr, profile_data)))
         return result, []
 
     def map_insn_loopy_kernel(self, insn, profile_data):
@@ -482,6 +561,42 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
 
         return list(result_dict.items()), []
 
+    def map_insn_assign_to_discr_scoped(self, insn, profile_data=None):
+        assignments = []
+
+        for name, expr in zip(insn.names, insn.exprs):
+            logger.debug("assignment not profiled: %s <- %s", name, expr)
+            value = self.rec(expr)
+            self.discrwb._discr_scoped_subexpr_name_to_value[name] = value
+            assignments.append((name, value))
+
+        return assignments, []
+
+    def map_insn_assign_from_discr_scoped(self, insn, profile_data=None):
+        return [(insn.name,
+            self.discrwb._discr_scoped_subexpr_name_to_value[insn.name])], []
+
+    def map_insn_rank_data_swap(self, insn, profile_data):
+        raise NotImplementedError("no profiling for instruction: %s" % insn)
+
+    def map_insn_diff_batch_assign(self, insn, profile_data):
+        assignments, futures = super().map_insn_diff_batch_assign(insn)
+
+        if profile_data is not None:
+            # We model the cost to load the input and write the output.  In
+            # particular, we assume the elementwise matrices are negligible in
+            # size and thus ignorable.
+
+            field = self.rec(insn.field)
+            profile_data["bytes_read"] = (
+                    profile_data.get("bytes_read", 0) + field.nbytes)
+
+            for _, value in assignments:
+                profile_data["bytes_written"] = (
+                        profile_data.get("bytes_writte", 0) + value.nbytes)
+
+        return assignments, futures
+
     # }}}
 
 # }}}
@@ -489,7 +604,8 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
 
 # {{{ mem op counter check
 
-def test_stepper_mem_ops():
+@pytest.mark.parametrize("use_fusion", (True, False))
+def test_stepper_mem_ops(use_fusion=True):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
@@ -508,7 +624,7 @@ def test_stepper_mem_ops():
             discr, op.sym_operator(),
             exec_mapper_factory=MemOpCountingExecutionMapper)
 
-    if 1:
+    if not use_fusion:
         stepper = RK4TimeStepper(
                 queue, discr, "w", bound_op, 1 + discr.dim,
                 get_strong_wave_component,
@@ -528,10 +644,10 @@ def test_stepper_mem_ops():
     for (_, _, profile_data) in stepper.run(
             ic, t_start, dt, t_end, return_profile_data=True):
         step += 1
-        logger.info("step %d/%d", step, nsteps)
+        logger.debug("step %d/%d", step, nsteps)
 
-    print("bytes read", profile_data["bytes_read"])
-    print("bytes written", profile_data["bytes_written"])
+    logger.info("bytes read: %d", profile_data["bytes_read"])
+    logger.info("bytes written: %d", profile_data["bytes_written"])
 
 # }}}
 
-- 
GitLab


From 9d3f5ef9d1396f1efb9fb08fd9f5eddd2711f49d Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sat, 6 Apr 2019 18:29:48 -0500
Subject: [PATCH 14/35] More cleanup

---
 examples/dagrt_fusion/fusion-study.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index ab33eb91..f130c248 100644
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -212,6 +212,10 @@ def transcribe_phase(dag, field_var_name, field_components, phase_name,
 
 class RK4TimeStepperBase(object):
 
+    def __init__(self, queue, component_getter):
+        self.queue = queue
+        self.component_getter = component_getter
+
     def get_initial_context(self, fields, t_start, dt):
         from pytools.obj_array import join_fields
 
@@ -308,6 +312,8 @@ class RK4TimeStepper(RK4TimeStepperBase):
                its components
 
         """
+        super().__init__(queue, component_getter)
+
         from pymbolic import var
 
         # Construct sym_rhs to have the effect of replacing the RHS calls in the
@@ -351,10 +357,9 @@ class FusedRK4TimeStepper(RK4TimeStepperBase):
 
     def __init__(self, queue, discr, field_var_name, sym_rhs, num_fields,
                  component_getter, exec_mapper_factory=ExecutionMapper):
-        self.queue = queue
+        super().__init__(queue, component_getter)
         self.set_up_stepper(
                 discr, field_var_name, sym_rhs, num_fields, exec_mapper_factory)
-        self.component_getter = component_getter
 
 # }}}
 
@@ -605,7 +610,7 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
 # {{{ mem op counter check
 
 @pytest.mark.parametrize("use_fusion", (True, False))
-def test_stepper_mem_ops(use_fusion=True):
+def test_stepper_mem_ops(use_fusion):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
-- 
GitLab


From cfbc273f6e933745dba2e2b43a64d7a0f4a376bc Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Mon, 8 Apr 2019 23:00:41 -0500
Subject: [PATCH 15/35] More hackery

---
 examples/dagrt_fusion/fusion-study.py | 179 +++++++++++++++++++++++---
 1 file changed, 160 insertions(+), 19 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index f130c248..3d300757 100644
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -38,18 +38,19 @@ import pyopencl.array  # noqa
 import pytest
 
 import dagrt.language as lang
-import loopy as lp
 import pymbolic.primitives as p
 import grudge.symbolic.mappers as gmap
 import grudge.symbolic.operators as op
 from grudge.execution import ExecutionMapper
 from pymbolic.mapper.evaluator import EvaluationMapper \
         as PymbolicEvaluationMapper
-from pytools import memoize_in
 
 from grudge import sym, bind, DGDiscretizationWithBoundaries
 from leap.rk import LSRK4Method
 
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
 
 logging.basicConfig(level=logging.INFO)
 
@@ -71,7 +72,7 @@ def topological_sort(stmts, root_deps):
             return
 
         stmt = id_to_stmt[name]
-        for dep in stmt.depends_on:
+        for dep in sorted(stmt.depends_on):
             satisfy_dep(dep)
         ordered_stmts.append(stmt)
         satisfied.add(name)
@@ -333,7 +334,8 @@ class RK4TimeStepper(RK4TimeStepperBase):
 
         self.queue = queue
         self.grudge_bound_op = grudge_bound_op
-        self.set_up_stepper(discr, field_var_name, sym_rhs, num_fields, exec_mapper_factory)
+        self.set_up_stepper(
+                discr, field_var_name, sym_rhs, num_fields, exec_mapper_factory)
         self.component_getter = component_getter
 
     def _bound_op(self, t, *args, profile_data=None):
@@ -411,8 +413,8 @@ def get_strong_wave_component(state_component):
 
 # {{{ equivalence check between fused and non-fused versions
 
-def test_stepper_equivalence(order=4):
-    cl_ctx = cl.create_some_context()
+def test_stepper_equivalence(ctx_factory, order=4):
+    cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
     dims = 2
@@ -479,7 +481,8 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
         args = [self.rec(p) for p in expr.parameters]
         return self.context[expr.function.name](*args, profile_data=profile_data)
 
-    def map_profiled_essentially_elementwise_linear(self, op, field_expr, profile_data):
+    def map_profiled_essentially_elementwise_linear(self, op, field_expr,
+                                                    profile_data):
         result = getattr(self, op.mapper_method)(op, field_expr)
 
         if profile_data is not None:
@@ -541,6 +544,9 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
             if profile_data is not None and isinstance(val, pyopencl.array.Array):
                 profile_data["bytes_read"] = (
                         profile_data.get("bytes_read", 0) + val.nbytes)
+                profile_data["bytes_read_within_assignments"] = (
+                        profile_data.get("bytes_read_within_assignments", 0)
+                        + val.nbytes)
 
         discr = self.discrwb.discr_from_dd(kdescr.governing_dd)
         for name in kdescr.scalar_args():
@@ -563,6 +569,9 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
             if profile_data is not None and isinstance(val, pyopencl.array.Array):
                 profile_data["bytes_written"] = (
                         profile_data.get("bytes_written", 0) + val.nbytes)
+                profile_data["bytes_written_within_assignments"] = (
+                        profile_data.get("bytes_written_within_assignments", 0)
+                        + val.nbytes)
 
         return list(result_dict.items()), []
 
@@ -598,7 +607,7 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
 
             for _, value in assignments:
                 profile_data["bytes_written"] = (
-                        profile_data.get("bytes_writte", 0) + value.nbytes)
+                        profile_data.get("bytes_written", 0) + value.nbytes)
 
         return assignments, futures
 
@@ -610,12 +619,13 @@ class MemOpCountingExecutionMapper(ExecutionMapper):
 # {{{ mem op counter check
 
 @pytest.mark.parametrize("use_fusion", (True, False))
-def test_stepper_mem_ops(use_fusion):
-    cl_ctx = cl.create_some_context()
+def test_stepper_mem_ops(ctx_factory, use_fusion):
+    cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
     dims = 2
-    op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=2, order=3)
+
+    op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=dims, order=3)
 
     t_start = 0
     dt = 0.04
@@ -625,11 +635,11 @@ def test_stepper_mem_ops(use_fusion):
     ic = join_fields(discr.zeros(queue),
             [discr.zeros(queue) for i in range(discr.dim)])
 
-    bound_op = bind(
-            discr, op.sym_operator(),
-            exec_mapper_factory=MemOpCountingExecutionMapper)
-
     if not use_fusion:
+        bound_op = bind(
+                discr, op.sym_operator(),
+                exec_mapper_factory=MemOpCountingExecutionMapper)
+
         stepper = RK4TimeStepper(
                 queue, discr, "w", bound_op, 1 + discr.dim,
                 get_strong_wave_component,
@@ -643,20 +653,151 @@ def test_stepper_mem_ops(use_fusion):
 
     step = 0
 
-    norm = bind(discr, sym.norm(2, sym.var("u_ref") - sym.var("u")))
-
     nsteps = int(np.ceil((t_end + 1e-9) / dt))
     for (_, _, profile_data) in stepper.run(
             ic, t_start, dt, t_end, return_profile_data=True):
         step += 1
-        logger.debug("step %d/%d", step, nsteps)
+        logger.info("step %d/%d: %f", step, nsteps)
 
+    logger.info("fusion? %s", use_fusion)
     logger.info("bytes read: %d", profile_data["bytes_read"])
     logger.info("bytes written: %d", profile_data["bytes_written"])
+    logger.info("bytes total: %d",
+            profile_data["bytes_read"] + profile_data["bytes_written"])
+
+# }}}
+
+
+# {{{ paper outputs
+
+def get_example_stepper(queue, dims=2, order=3, use_fusion=True,
+                        exec_mapper_factory=ExecutionMapper,
+                        return_ic=False):
+    op, discr = get_strong_wave_op_with_discr(queue.context, dims=dims, order=3)
+
+    if not use_fusion:
+        bound_op = bind(
+                discr, op.sym_operator(),
+                exec_mapper_factory=exec_mapper_factory)
+
+        stepper = RK4TimeStepper(
+                queue, discr, "w", bound_op, 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=exec_mapper_factory)
+
+    else:
+        stepper = FusedRK4TimeStepper(
+                queue, discr, "w", op.sym_operator(), 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=exec_mapper_factory)
+
+    if return_ic:
+        from pytools.obj_array import join_fields
+        ic = join_fields(discr.zeros(queue),
+                [discr.zeros(queue) for i in range(discr.dim)])
+        return stepper, ic
+
+    return stepper
+
+
+def statement_counts_table():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    fused_stepper = get_example_stepper(queue, use_fusion=True)
+    stepper = get_example_stepper(queue, use_fusion=False)
+
+    print(r"\begin{tabular}{lr}")
+    print(r"\toprule")
+    print(r"Operator & Grudge Node Count \\")
+    print(r"\midrule")
+    print(
+            r"Time integration (not fused) & %d \\"
+            % len(stepper.bound_op.eval_code.instructions))
+    print(
+            r"Right-hand side (not fused) & %d \\"
+            % len(stepper.grudge_bound_op.eval_code.instructions))
+    print(
+            r"Fused operator & %d \\"
+            % len(fused_stepper.bound_op.eval_code.instructions))
+    print(r"\bottomrule")
+    print(r"\end{tabular}")
+
+
+"""
+def graphs():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    fused_stepper = get_example_stepper(queue, use_fusion=True)
+    stepper = get_example_stepper(queue, use_fusion=False)
+
+    from grudge.symbolic.compiler import dot_dataflow_graph
+"""
+
+
+def mem_ops_table():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    fused_stepper = get_example_stepper(
+            queue,
+            use_fusion=True,
+            exec_mapper_factory=MemOpCountingExecutionMapper)
+
+    stepper, ic = get_example_stepper(
+            queue,
+            use_fusion=False,
+            exec_mapper_factory=MemOpCountingExecutionMapper,
+            return_ic=True)
+
+    t_start = 0
+    dt = 0.02
+    t_end = 0.02
+
+    for (_, _, profile_data) in stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        pass
+
+    nonfused_bytes_read = profile_data["bytes_read"]
+    nonfused_bytes_written = profile_data["bytes_written"]
+    nonfused_bytes_total = nonfused_bytes_read + nonfused_bytes_written
+
+    for (_, _, profile_data) in fused_stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        pass
+
+    fused_bytes_read = profile_data["bytes_read"]
+    fused_bytes_written = profile_data["bytes_written"]
+    fused_bytes_total = fused_bytes_read + fused_bytes_written
+
+    print(r"\begin{tabular}{lrrr}")
+    print(r"\toprule")
+    print(r"Operator & Bytes Read & Bytes Written & Total (\% of Baseline) \\")
+    print(r"\midrule")
+    print(
+            r"Baseline & \num{%d} & \num{%d} & \num{%d} (100) \\"
+            % (
+                nonfused_bytes_read,
+                nonfused_bytes_written,
+                nonfused_bytes_total))
+    print(
+            r"Fused & \num{%d} & \num{%d} & \num{%d} (%.1f) \\"
+            % (
+                fused_bytes_read,
+                fused_bytes_written,
+                fused_bytes_total,
+                100 * fused_bytes_total / nonfused_bytes_total))
+    print(r"\bottomrule")
+    print(r"\end{tabular}")
 
 # }}}
 
 
 if __name__ == "__main__":
     #test_stepper_equivalence()
-    test_stepper_mem_ops()
+    #test_stepper_mem_ops(True)
+    #test_stepper_mem_ops(False)
+    #statement_counts_table()
+    #mem_ops_table()
+    pass
-- 
GitLab


From 349e5968d72ffc687f80d756a289fe9e49ec5dbd Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Tue, 9 Apr 2019 01:04:13 -0500
Subject: [PATCH 16/35] More experiments

---
 examples/dagrt_fusion/fusion-study.py | 212 ++++++++++++++++++++++++--
 grudge/symbolic/compiler.py           |   2 +-
 2 files changed, 203 insertions(+), 11 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index 3d300757..a959f4d9 100644
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -373,7 +373,7 @@ def get_strong_wave_op_with_discr(cl_ctx, dims=2, order=4):
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dims,
             b=(0.5,)*dims,
-            n=(16,)*dims)
+            n=(256,)*dims)
 
     logger.debug("%d elements" % mesh.nelements)
 
@@ -462,7 +462,7 @@ def test_stepper_equivalence(ctx_factory, order=4):
 
 # {{{ mem op counter implementation
 
-class MemOpCountingExecutionMapper(ExecutionMapper):
+class ExecutionMapperWithMemOpCounting(ExecutionMapper):
     # This is a skeleton implementation that only has just enough functionality
     # for the wave-min example to work.
 
@@ -638,18 +638,18 @@ def test_stepper_mem_ops(ctx_factory, use_fusion):
     if not use_fusion:
         bound_op = bind(
                 discr, op.sym_operator(),
-                exec_mapper_factory=MemOpCountingExecutionMapper)
+                exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
         stepper = RK4TimeStepper(
                 queue, discr, "w", bound_op, 1 + discr.dim,
                 get_strong_wave_component,
-                exec_mapper_factory=MemOpCountingExecutionMapper)
+                exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
     else:
         stepper = FusedRK4TimeStepper(
                 queue, discr, "w", op.sym_operator(), 1 + discr.dim,
                 get_strong_wave_component,
-                exec_mapper_factory=MemOpCountingExecutionMapper)
+                exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
     step = 0
 
@@ -657,7 +657,7 @@ def test_stepper_mem_ops(ctx_factory, use_fusion):
     for (_, _, profile_data) in stepper.run(
             ic, t_start, dt, t_end, return_profile_data=True):
         step += 1
-        logger.info("step %d/%d: %f", step, nsteps)
+        logger.info("step %d/%d", step, nsteps)
 
     logger.info("fusion? %s", use_fusion)
     logger.info("bytes read: %d", profile_data["bytes_read"])
@@ -668,6 +668,141 @@ def test_stepper_mem_ops(ctx_factory, use_fusion):
 # }}}
 
 
+# {{{ execution mapper with timing
+
+def time_insn(f):
+    from pytools import ProcessTimer
+    time_field_name = "time_%s" % f.__name__
+
+    def wrapper(self, insn, profile_data):
+        timer = ProcessTimer()
+        retval = f(self, insn, profile_data)
+        timer.done()
+
+        if profile_data is not None:
+            profile_data[time_field_name] = (
+                    profile_data.get(time_field_name, 0)
+                    + timer.wall_elapsed)
+
+        return retval
+
+    return wrapper
+
+
+class ExecutionMapperWithTiming(ExecutionMapper):
+
+    def map_external_call(self, expr):
+        # Should have been caught by our implementation.
+        assert False, ("map_external_call called: %s" % (expr))
+        
+    def map_operator_binding(self, expr):
+        # Should have been caught by our implementation.
+        assert False, ("map_operator_binding called: %s" % expr)
+
+    def map_profiled_external_call(self, expr, profile_data):
+        from pymbolic.primitives import Variable
+        assert isinstance(expr.function, Variable)
+        args = [self.rec(p) for p in expr.parameters]
+        return self.context[expr.function.name](*args, profile_data=profile_data)
+
+    def map_profiled_operator_binding(self, expr, profile_data):
+        from pytools import ProcessTimer
+        timer = ProcessTimer()
+        retval = super().map_operator_binding(expr)
+        timer.done()
+        if profile_data is not None:
+            time_field_name = "time_op_%s" % expr.op.mapper_method
+            profile_data[time_field_name] = (
+                    profile_data.get(time_field_name, 0)
+                    + timer.wall_elapsed)
+        return retval
+
+    @time_insn
+    def map_insn_loopy_kernel(self, *args, **kwargs):
+        return super().map_insn_loopy_kernel(*args, **kwargs)
+
+    @time_insn
+    def map_insn_assign(self, insn, profile_data):
+        if len(insn.exprs) == 1:
+            if isinstance(insn.exprs[0], sym.ExternalCall):
+                assert insn.exprs[0].mapper_method == "map_external_call"
+                val = self.map_profiled_external_call(insn.exprs[0], profile_data)
+                return [(insn.names[0], val)], []
+            elif isinstance(insn.exprs[0], sym.OperatorBinding):
+                assert insn.exprs[0].mapper_method == "map_operator_binding"
+                val = self.map_profiled_operator_binding(insn.exprs[0], profile_data)
+                return [(insn.names[0], val)], []
+                                                  
+        return super().map_insn_assign(insn, profile_data)
+
+    @time_insn
+    def map_insn_assign_to_discr_scoped(self, insn, profile_data):
+        return super().map_insn_assign_to_discr_scoped(insn, profile_data)
+
+    @time_insn
+    def map_insn_assign_from_discr_scoped(self, insn, profile_data):
+        return super().map_insn_assign_from_discr_scoped(insn, profile_data)
+
+    @time_insn
+    def map_insn_diff_batch_assign(self, insn, profile_data):
+        return super().map_insn_diff_batch_assign(insn, profile_data)
+
+# }}}
+
+
+# {{{ timing check
+
+@pytest.mark.parametrize("use_fusion", (True, False))
+def test_stepper_timing(ctx_factory, use_fusion):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    dims = 2
+
+    op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=dims, order=3)
+
+    t_start = 0
+    dt = 0.04
+    t_end = 0.1
+
+    from pytools.obj_array import join_fields
+    ic = join_fields(discr.zeros(queue),
+            [discr.zeros(queue) for i in range(discr.dim)])
+
+    if not use_fusion:
+        bound_op = bind(
+                discr, op.sym_operator(),
+                exec_mapper_factory=ExecutionMapperWithTiming)
+
+        stepper = RK4TimeStepper(
+                queue, discr, "w", bound_op, 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=ExecutionMapperWithTiming)
+
+    else:
+        stepper = FusedRK4TimeStepper(
+                queue, discr, "w", op.sym_operator(), 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=ExecutionMapperWithTiming)
+
+    step = 0
+
+    import time
+    t = time.time()
+    nsteps = int(np.ceil((t_end + 1e-9) / dt))
+    for (_, _, profile_data) in stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        step += 1
+        tn = time.time()
+        logger.info("step %d/%d: %f", step, nsteps, tn - t)
+        t = tn
+
+    logger.info("fusion? %s", use_fusion)
+    logger.info(profile_data)
+
+# }}}
+
+
 # {{{ paper outputs
 
 def get_example_stepper(queue, dims=2, order=3, use_fusion=True,
@@ -743,12 +878,68 @@ def mem_ops_table():
     fused_stepper = get_example_stepper(
             queue,
             use_fusion=True,
-            exec_mapper_factory=MemOpCountingExecutionMapper)
+            exec_mapper_factory=ExecutionMapperWithMemOpCounting)
+
+    stepper, ic = get_example_stepper(
+            queue,
+            use_fusion=False,
+            exec_mapper_factory=ExecutionMapperWithMemOpCounting,
+            return_ic=True)
+
+    t_start = 0
+    dt = 0.02
+    t_end = 0.02
+
+    for (_, _, profile_data) in stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        pass
+
+    nonfused_bytes_read = profile_data["bytes_read"]
+    nonfused_bytes_written = profile_data["bytes_written"]
+    nonfused_bytes_total = nonfused_bytes_read + nonfused_bytes_written
+
+    for (_, _, profile_data) in fused_stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        pass
+
+    fused_bytes_read = profile_data["bytes_read"]
+    fused_bytes_written = profile_data["bytes_written"]
+    fused_bytes_total = fused_bytes_read + fused_bytes_written
+
+    print(r"\begin{tabular}{lrrr}")
+    print(r"\toprule")
+    print(r"Operator & Bytes Read & Bytes Written & Total (\% of Baseline) \\")
+    print(r"\midrule")
+    print(
+            r"Baseline & \num{%d} & \num{%d} & \num{%d} (100) \\"
+            % (
+                nonfused_bytes_read,
+                nonfused_bytes_written,
+                nonfused_bytes_total))
+    print(
+            r"Fused & \num{%d} & \num{%d} & \num{%d} (%.1f) \\"
+            % (
+                fused_bytes_read,
+                fused_bytes_written,
+                fused_bytes_total,
+                100 * fused_bytes_total / nonfused_bytes_total))
+    print(r"\bottomrule")
+    print(r"\end{tabular}")
+
+
+def mem_ops_table():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    fused_stepper = get_example_stepper(
+            queue,
+            use_fusion=True,
+            exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
     stepper, ic = get_example_stepper(
             queue,
             use_fusion=False,
-            exec_mapper_factory=MemOpCountingExecutionMapper,
+            exec_mapper_factory=ExecutionMapperWithMemOpCounting,
             return_ic=True)
 
     t_start = 0
@@ -799,5 +990,6 @@ if __name__ == "__main__":
     #test_stepper_mem_ops(True)
     #test_stepper_mem_ops(False)
     #statement_counts_table()
-    #mem_ops_table()
-    pass
+    mem_ops_table()
+    #pass
+    #test_stepper_timing(cl._csc, False)
diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index 6bd950ed..81a0af6a 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -590,7 +590,7 @@ class Code(object):
             exec_sub_timer.stop().submit()
         from pytools.obj_array import with_object_array_or_scalar
         if profile_data is not None:
-            profile_data['total_time'] += time() - start_time
+            profile_data['total_time'] = time() - start_time
             return (with_object_array_or_scalar(exec_mapper, self.result),
                     profile_data)
         return with_object_array_or_scalar(exec_mapper, self.result)
-- 
GitLab


From 318e17208098ec3115b9184fb96ac4a3be3659ad Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 10 Apr 2019 19:03:11 -0500
Subject: [PATCH 17/35] Add timing

---
 examples/dagrt_fusion/fusion-study.py | 182 +++++++++++++-------------
 1 file changed, 90 insertions(+), 92 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index a959f4d9..b960ae5f 100644
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -373,7 +373,7 @@ def get_strong_wave_op_with_discr(cl_ctx, dims=2, order=4):
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dims,
             b=(0.5,)*dims,
-            n=(256,)*dims)
+            n=(16,)*dims)
 
     logger.debug("%d elements" % mesh.nelements)
 
@@ -496,6 +496,12 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
             profile_data["bytes_written"] = (
                     profile_data.get("bytes_written", 0) + result.nbytes)
 
+            if op.mapper_method == "map_interpolation":
+                profile_data["interp_bytes_read"] = (
+                        profile_data.get("interp_bytes_read", 0) + field.nbytes)
+                profile_data["interp_bytes_written"] = (
+                        profile_data.get("interp_bytes_written", 0) + result.nbytes)
+
         return result
 
     # }}}
@@ -670,19 +676,63 @@ def test_stepper_mem_ops(ctx_factory, use_fusion):
 
 # {{{ execution mapper with timing
 
+
+SECONDS_PER_NANOSECOND = 10**9
+
+
+class TimingFuture(object):
+
+    def __init__(self, start_event, stop_event):
+        self.start_event = start_event
+        self.stop_event = stop_event
+
+    def elapsed(self):
+        cl.wait_for_events([self.start_event, self.stop_event])
+        return (
+                self.stop_event.profile.end
+                - self.start_event.profile.end) / SECONDS_PER_NANOSECOND
+
+
+from collections.abc import MutableSequence
+
+
+class TimingFutureList(MutableSequence):
+
+    def __init__(self, *args, **kwargs):
+        self._list = list(*args, **kwargs)
+
+    def __len__(self):
+        return len(self._list)
+
+    def __getitem__(self, idx):
+        return self._list[idx]
+
+    def __setitem__(self, idx, val):
+        self._list[idx] = val
+
+    def __delitem__(self, idx):
+        del self._list[idx]
+
+    def insert(self, idx, val):
+        self._list.insert(idx, val)
+
+    def elapsed(self):
+        return sum(future.elapsed() for future in self._list)
+
+
 def time_insn(f):
-    from pytools import ProcessTimer
     time_field_name = "time_%s" % f.__name__
 
     def wrapper(self, insn, profile_data):
-        timer = ProcessTimer()
-        retval = f(self, insn, profile_data)
-        timer.done()
+        if profile_data is None:
+            return f(self, insn, profile_data)
 
-        if profile_data is not None:
-            profile_data[time_field_name] = (
-                    profile_data.get(time_field_name, 0)
-                    + timer.wall_elapsed)
+        start = cl.enqueue_marker(self.queue)
+        retval = f(self, insn, profile_data)
+        end = cl.enqueue_marker(self.queue)
+        profile_data\
+                .setdefault(time_field_name, TimingFutureList())\
+                .append(TimingFuture(start, end))
 
         return retval
 
@@ -694,7 +744,7 @@ class ExecutionMapperWithTiming(ExecutionMapper):
     def map_external_call(self, expr):
         # Should have been caught by our implementation.
         assert False, ("map_external_call called: %s" % (expr))
-        
+
     def map_operator_binding(self, expr):
         # Should have been caught by our implementation.
         assert False, ("map_operator_binding called: %s" % expr)
@@ -706,22 +756,23 @@ class ExecutionMapperWithTiming(ExecutionMapper):
         return self.context[expr.function.name](*args, profile_data=profile_data)
 
     def map_profiled_operator_binding(self, expr, profile_data):
-        from pytools import ProcessTimer
-        timer = ProcessTimer()
+        if profile_data is None:
+            return super().map_operator_binding(expr)
+
+        start = cl.enqueue_marker(self.queue)
         retval = super().map_operator_binding(expr)
-        timer.done()
-        if profile_data is not None:
-            time_field_name = "time_op_%s" % expr.op.mapper_method
-            profile_data[time_field_name] = (
-                    profile_data.get(time_field_name, 0)
-                    + timer.wall_elapsed)
+        end = cl.enqueue_marker(self.queue)
+        time_field_name = "time_op_%s" % expr.op.mapper_method
+        profile_data\
+                .setdefault(time_field_name, TimingFutureList())\
+                .append(TimingFuture(start, end))
+
         return retval
 
     @time_insn
     def map_insn_loopy_kernel(self, *args, **kwargs):
         return super().map_insn_loopy_kernel(*args, **kwargs)
 
-    @time_insn
     def map_insn_assign(self, insn, profile_data):
         if len(insn.exprs) == 1:
             if isinstance(insn.exprs[0], sym.ExternalCall):
@@ -732,16 +783,8 @@ class ExecutionMapperWithTiming(ExecutionMapper):
                 assert insn.exprs[0].mapper_method == "map_operator_binding"
                 val = self.map_profiled_operator_binding(insn.exprs[0], profile_data)
                 return [(insn.names[0], val)], []
-                                                  
-        return super().map_insn_assign(insn, profile_data)
 
-    @time_insn
-    def map_insn_assign_to_discr_scoped(self, insn, profile_data):
-        return super().map_insn_assign_to_discr_scoped(insn, profile_data)
-
-    @time_insn
-    def map_insn_assign_from_discr_scoped(self, insn, profile_data):
-        return super().map_insn_assign_from_discr_scoped(insn, profile_data)
+        return super().map_insn_assign(insn, profile_data)
 
     @time_insn
     def map_insn_diff_batch_assign(self, insn, profile_data):
@@ -755,9 +798,11 @@ class ExecutionMapperWithTiming(ExecutionMapper):
 @pytest.mark.parametrize("use_fusion", (True, False))
 def test_stepper_timing(ctx_factory, use_fusion):
     cl_ctx = ctx_factory()
-    queue = cl.CommandQueue(cl_ctx)
+    queue = cl.CommandQueue(
+            cl_ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    dims = 2
+    dims = 3
 
     op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=dims, order=3)
 
@@ -798,7 +843,9 @@ def test_stepper_timing(ctx_factory, use_fusion):
         t = tn
 
     logger.info("fusion? %s", use_fusion)
-    logger.info(profile_data)
+    for key, value in profile_data.items():
+        if isinstance(value, TimingFutureList):
+            print(key, value.elapsed())
 
 # }}}
 
@@ -808,6 +855,7 @@ def test_stepper_timing(ctx_factory, use_fusion):
 def get_example_stepper(queue, dims=2, order=3, use_fusion=True,
                         exec_mapper_factory=ExecutionMapper,
                         return_ic=False):
+    print("DIMS", dims)
     op, discr = get_strong_wave_op_with_discr(queue.context, dims=dims, order=3)
 
     if not use_fusion:
@@ -877,11 +925,13 @@ def mem_ops_table():
 
     fused_stepper = get_example_stepper(
             queue,
+            dims=3,
             use_fusion=True,
             exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
     stepper, ic = get_example_stepper(
             queue,
+            dims=3,
             use_fusion=False,
             exec_mapper_factory=ExecutionMapperWithMemOpCounting,
             return_ic=True)
@@ -898,61 +948,7 @@ def mem_ops_table():
     nonfused_bytes_written = profile_data["bytes_written"]
     nonfused_bytes_total = nonfused_bytes_read + nonfused_bytes_written
 
-    for (_, _, profile_data) in fused_stepper.run(
-            ic, t_start, dt, t_end, return_profile_data=True):
-        pass
-
-    fused_bytes_read = profile_data["bytes_read"]
-    fused_bytes_written = profile_data["bytes_written"]
-    fused_bytes_total = fused_bytes_read + fused_bytes_written
-
-    print(r"\begin{tabular}{lrrr}")
-    print(r"\toprule")
-    print(r"Operator & Bytes Read & Bytes Written & Total (\% of Baseline) \\")
-    print(r"\midrule")
-    print(
-            r"Baseline & \num{%d} & \num{%d} & \num{%d} (100) \\"
-            % (
-                nonfused_bytes_read,
-                nonfused_bytes_written,
-                nonfused_bytes_total))
-    print(
-            r"Fused & \num{%d} & \num{%d} & \num{%d} (%.1f) \\"
-            % (
-                fused_bytes_read,
-                fused_bytes_written,
-                fused_bytes_total,
-                100 * fused_bytes_total / nonfused_bytes_total))
-    print(r"\bottomrule")
-    print(r"\end{tabular}")
-
-
-def mem_ops_table():
-    cl_ctx = cl.create_some_context()
-    queue = cl.CommandQueue(cl_ctx)
-
-    fused_stepper = get_example_stepper(
-            queue,
-            use_fusion=True,
-            exec_mapper_factory=ExecutionMapperWithMemOpCounting)
-
-    stepper, ic = get_example_stepper(
-            queue,
-            use_fusion=False,
-            exec_mapper_factory=ExecutionMapperWithMemOpCounting,
-            return_ic=True)
-
-    t_start = 0
-    dt = 0.02
-    t_end = 0.02
-
-    for (_, _, profile_data) in stepper.run(
-            ic, t_start, dt, t_end, return_profile_data=True):
-        pass
-
-    nonfused_bytes_read = profile_data["bytes_read"]
-    nonfused_bytes_written = profile_data["bytes_written"]
-    nonfused_bytes_total = nonfused_bytes_read + nonfused_bytes_written
+    print("profile data, non fused", profile_data)
 
     for (_, _, profile_data) in fused_stepper.run(
             ic, t_start, dt, t_end, return_profile_data=True):
@@ -962,18 +958,20 @@ def mem_ops_table():
     fused_bytes_written = profile_data["bytes_written"]
     fused_bytes_total = fused_bytes_read + fused_bytes_written
 
-    print(r"\begin{tabular}{lrrr}")
+    print("profile data, fused", profile_data)
+
+    print(r"\begin{tabular}{lrrrr}")
     print(r"\toprule")
-    print(r"Operator & Bytes Read & Bytes Written & Total (\% of Baseline) \\")
+    print(r"Operator & Bytes Read & Bytes Written & Total & \% of Baseline \\")
     print(r"\midrule")
     print(
-            r"Baseline & \num{%d} & \num{%d} & \num{%d} (100) \\"
+            r"Baseline & \num{%d} & \num{%d} & \num{%d} & 100 \\"
             % (
                 nonfused_bytes_read,
                 nonfused_bytes_written,
                 nonfused_bytes_total))
     print(
-            r"Fused & \num{%d} & \num{%d} & \num{%d} (%.1f) \\"
+            r"Fused & \num{%d} & \num{%d} & \num{%d} & \num{%.1f} \\"
             % (
                 fused_bytes_read,
                 fused_bytes_written,
@@ -990,6 +988,6 @@ if __name__ == "__main__":
     #test_stepper_mem_ops(True)
     #test_stepper_mem_ops(False)
     #statement_counts_table()
-    mem_ops_table()
+    #mem_ops_table()
     #pass
-    #test_stepper_timing(cl._csc, False)
+    test_stepper_timing(cl._csc, False)
-- 
GitLab


From a25cba4219540e1b9fd044827c678ae619394250 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Tue, 16 Apr 2019 18:57:09 -0500
Subject: [PATCH 18/35] Updates

---
 examples/dagrt_fusion/fusion-study.py | 41 +++++++--------------------
 setup.py                              |  4 +--
 2 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index b960ae5f..4af9756c 100644
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""A study of operator fusion for time integration operators in Grudge.
+"""Study of operator fusion (inlining) for time integration operators in Grudge.
 """
 
 from __future__ import division, print_function
@@ -114,15 +114,17 @@ class GrudgeArgSubstitutor(gmap.SymbolicEvaluator):
 
 def transcribe_phase(dag, field_var_name, field_components, phase_name,
                      sym_operator):
-    """Arguments:
+    """Generate a Grudge operator for a Dagrt time integrator phase.
 
-        dag: The Dagrt code for the time integrator
+    Arguments:
+
+        dag: The Dagrt code object for the time integrator
 
         field_var_name: The name of the simulation variable
 
         field_components: The number of components (fields) in the variable
 
-        phase_name: The name of the phase to transcribe in the Dagrt code
+        phase_name: The name of the phase to transcribe
 
         sym_operator: The Grudge symbolic expression to substitue for the
             right-hand side evaluation in the Dagrt code
@@ -160,7 +162,7 @@ def transcribe_phase(dag, field_var_name, field_components, phase_name,
         if isinstance(stmt, lang.Nop):
             pass
 
-        elif isinstance(stmt, lang.AssignExpression):
+        elif isinstance(stmt, lang.Assign):
             if not isinstance(stmt.lhs, p.Variable):
                 raise NotImplementedError("lhs of statement %s is not a variable: %s"
                         % (stmt.id, stmt.lhs))
@@ -665,7 +667,7 @@ def test_stepper_mem_ops(ctx_factory, use_fusion):
         step += 1
         logger.info("step %d/%d", step, nsteps)
 
-    logger.info("fusion? %s", use_fusion)
+    logger.info("using fusion? %s", use_fusion)
     logger.info("bytes read: %d", profile_data["bytes_read"])
     logger.info("bytes written: %d", profile_data["bytes_written"])
     logger.info("bytes total: %d",
@@ -676,7 +678,6 @@ def test_stepper_mem_ops(ctx_factory, use_fusion):
 
 # {{{ execution mapper with timing
 
-
 SECONDS_PER_NANOSECOND = 10**9
 
 
@@ -855,7 +856,6 @@ def test_stepper_timing(ctx_factory, use_fusion):
 def get_example_stepper(queue, dims=2, order=3, use_fusion=True,
                         exec_mapper_factory=ExecutionMapper,
                         return_ic=False):
-    print("DIMS", dims)
     op, discr = get_strong_wave_op_with_discr(queue.context, dims=dims, order=3)
 
     if not use_fusion:
@@ -907,18 +907,6 @@ def statement_counts_table():
     print(r"\end{tabular}")
 
 
-"""
-def graphs():
-    cl_ctx = cl.create_some_context()
-    queue = cl.CommandQueue(cl_ctx)
-
-    fused_stepper = get_example_stepper(queue, use_fusion=True)
-    stepper = get_example_stepper(queue, use_fusion=False)
-
-    from grudge.symbolic.compiler import dot_dataflow_graph
-"""
-
-
 def mem_ops_table():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
@@ -948,8 +936,6 @@ def mem_ops_table():
     nonfused_bytes_written = profile_data["bytes_written"]
     nonfused_bytes_total = nonfused_bytes_read + nonfused_bytes_written
 
-    print("profile data, non fused", profile_data)
-
     for (_, _, profile_data) in fused_stepper.run(
             ic, t_start, dt, t_end, return_profile_data=True):
         pass
@@ -958,8 +944,6 @@ def mem_ops_table():
     fused_bytes_written = profile_data["bytes_written"]
     fused_bytes_total = fused_bytes_read + fused_bytes_written
 
-    print("profile data, fused", profile_data)
-
     print(r"\begin{tabular}{lrrrr}")
     print(r"\toprule")
     print(r"Operator & Bytes Read & Bytes Written & Total & \% of Baseline \\")
@@ -984,10 +968,5 @@ def mem_ops_table():
 
 
 if __name__ == "__main__":
-    #test_stepper_equivalence()
-    #test_stepper_mem_ops(True)
-    #test_stepper_mem_ops(False)
-    #statement_counts_table()
-    #mem_ops_table()
-    #pass
-    test_stepper_timing(cl._csc, False)
+    statement_counts_table()
+    mem_ops_table()
diff --git a/setup.py b/setup.py
index 1f9ecbd0..e975a5cb 100644
--- a/setup.py
+++ b/setup.py
@@ -53,8 +53,8 @@ def main():
               "pymbolic>=2013.2",
               "loo.py>=2013.1beta",
               "cgen>=2013.1.2",
-              "leap>=2015.1",
-              "dagrt>=2015.1",
+              "leap>=2019.1",
+              "dagrt>=2019.1",
 
               "six>=1.8",
               ])
-- 
GitLab


From c1892738ba97685f22112a6a4ecb2db331e1c33a Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Tue, 23 Apr 2019 01:12:07 -0500
Subject: [PATCH 19/35] Add/modify results table code

---
 examples/dagrt_fusion/fusion-study.py | 218 ++++++++++++++++++++------
 1 file changed, 168 insertions(+), 50 deletions(-)
 mode change 100644 => 100755 examples/dagrt_fusion/fusion-study.py

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
old mode 100644
new mode 100755
index 4af9756c..1fad3375
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -44,6 +44,7 @@ import grudge.symbolic.operators as op
 from grudge.execution import ExecutionMapper
 from pymbolic.mapper.evaluator import EvaluationMapper \
         as PymbolicEvaluationMapper
+from pytools import memoize
 
 from grudge import sym, bind, DGDiscretizationWithBoundaries
 from leap.rk import LSRK4Method
@@ -552,8 +553,8 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
             if profile_data is not None and isinstance(val, pyopencl.array.Array):
                 profile_data["bytes_read"] = (
                         profile_data.get("bytes_read", 0) + val.nbytes)
-                profile_data["bytes_read_within_assignments"] = (
-                        profile_data.get("bytes_read_within_assignments", 0)
+                profile_data["bytes_read_by_scalar_assignments"] = (
+                        profile_data.get("bytes_read_by_scalar_assignments", 0)
                         + val.nbytes)
 
         discr = self.discrwb.discr_from_dd(kdescr.governing_dd)
@@ -577,8 +578,8 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
             if profile_data is not None and isinstance(val, pyopencl.array.Array):
                 profile_data["bytes_written"] = (
                         profile_data.get("bytes_written", 0) + val.nbytes)
-                profile_data["bytes_written_within_assignments"] = (
-                        profile_data.get("bytes_written_within_assignments", 0)
+                profile_data["bytes_written_by_scalar_assignments"] = (
+                        profile_data.get("bytes_written_by_scalar_assignments", 0)
                         + val.nbytes)
 
         return list(result_dict.items()), []
@@ -883,6 +884,20 @@ def get_example_stepper(queue, dims=2, order=3, use_fusion=True,
     return stepper
 
 
+def latex_table(table_format, header, rows):
+    result = []
+    _ = result.append
+    _(rf"\begin{{tabular}}{{{table_format}}}")
+    _(r"\toprule")
+    _(" & ".join(rf"\multicolumn{{1}}{{c}}{{{item}}}" for item in header) + r" \\")
+    _(r"\midrule")
+    for row in rows:
+        _(" & ".join(row) + r" \\")
+    _(r"\bottomrule")
+    _(r"\end{tabular}")
+    return "\n".join(result)
+
+
 def statement_counts_table():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
@@ -890,36 +905,35 @@ def statement_counts_table():
     fused_stepper = get_example_stepper(queue, use_fusion=True)
     stepper = get_example_stepper(queue, use_fusion=False)
 
-    print(r"\begin{tabular}{lr}")
-    print(r"\toprule")
-    print(r"Operator & Grudge Node Count \\")
-    print(r"\midrule")
-    print(
-            r"Time integration (not fused) & %d \\"
-            % len(stepper.bound_op.eval_code.instructions))
-    print(
-            r"Right-hand side (not fused) & %d \\"
-            % len(stepper.grudge_bound_op.eval_code.instructions))
-    print(
-            r"Fused operator & %d \\"
-            % len(fused_stepper.bound_op.eval_code.instructions))
-    print(r"\bottomrule")
-    print(r"\end{tabular}")
-
-
-def mem_ops_table():
-    cl_ctx = cl.create_some_context()
-    queue = cl.CommandQueue(cl_ctx)
+    out_path = "statement-counts.tex"
+    outf = open(out_path, "w")
 
+    print(
+        latex_table(
+            "lr",
+            ("Operator", "Grudge Node Count"),
+            (
+                ("Time integration: baseline",
+                 r"\num{%d}" % len(stepper.bound_op.eval_code.instructions)),
+                ("Right-hand side: baseline",
+                 r"\num{%d}" % len(stepper.grudge_bound_op.eval_code.instructions)),
+                ("Inlined operator",
+                 r"\num{%d}" % len(fused_stepper.bound_op.eval_code.instructions))
+            )),
+        file=outf)
+
+
+@memoize(key=lambda queue, dims: dims)
+def mem_ops_results(queue, dims):
     fused_stepper = get_example_stepper(
             queue,
-            dims=3,
+            dims=dims,
             use_fusion=True,
             exec_mapper_factory=ExecutionMapperWithMemOpCounting)
 
     stepper, ic = get_example_stepper(
             queue,
-            dims=3,
+            dims=dims,
             use_fusion=False,
             exec_mapper_factory=ExecutionMapperWithMemOpCounting,
             return_ic=True)
@@ -928,45 +942,149 @@ def mem_ops_table():
     dt = 0.02
     t_end = 0.02
 
+    result = {}
+
     for (_, _, profile_data) in stepper.run(
             ic, t_start, dt, t_end, return_profile_data=True):
         pass
 
-    nonfused_bytes_read = profile_data["bytes_read"]
-    nonfused_bytes_written = profile_data["bytes_written"]
-    nonfused_bytes_total = nonfused_bytes_read + nonfused_bytes_written
+    result["nonfused_bytes_read"] = profile_data["bytes_read"]
+    result["nonfused_bytes_written"] = profile_data["bytes_written"]
+    result["nonfused_bytes_total"] = \
+            result["nonfused_bytes_read"] \
+            + result["nonfused_bytes_written"]
+
+    result["nonfused_bytes_read_by_scalar_assignments"] = \
+            profile_data["bytes_read_by_scalar_assignments"]
+    result["nonfused_bytes_written_by_scalar_assignments"] = \
+            profile_data["bytes_written_by_scalar_assignments"]
+    result["nonfused_bytes_total_by_scalar_assignments"] = \
+            result["nonfused_bytes_read_by_scalar_assignments"] \
+            + result["nonfused_bytes_written_by_scalar_assignments"]
 
     for (_, _, profile_data) in fused_stepper.run(
             ic, t_start, dt, t_end, return_profile_data=True):
         pass
 
-    fused_bytes_read = profile_data["bytes_read"]
-    fused_bytes_written = profile_data["bytes_written"]
-    fused_bytes_total = fused_bytes_read + fused_bytes_written
+    result["fused_bytes_read"] = profile_data["bytes_read"]
+    result["fused_bytes_written"] = profile_data["bytes_written"]
+    result["fused_bytes_total"] = \
+            result["fused_bytes_read"] \
+            + result["fused_bytes_written"]
+
+    result["fused_bytes_read_by_scalar_assignments"] = \
+            profile_data["bytes_read_by_scalar_assignments"]
+    result["fused_bytes_written_by_scalar_assignments"] = \
+            profile_data["bytes_written_by_scalar_assignments"]
+    result["fused_bytes_total_by_scalar_assignments"] = \
+            result["fused_bytes_read_by_scalar_assignments"] \
+            + result["fused_bytes_written_by_scalar_assignments"]
+
+    return result
+
+
+def scalar_assignment_percent_of_total_mem_ops_table():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    result2d = mem_ops_results(queue, 2)
+    result3d = mem_ops_results(queue, 3)
+
+    out_path = "scalar-assignments-mem-op-percentage.tex"
+    outf = open(out_path, "w")
 
-    print(r"\begin{tabular}{lrrrr}")
-    print(r"\toprule")
-    print(r"Operator & Bytes Read & Bytes Written & Total & \% of Baseline \\")
-    print(r"\midrule")
     print(
-            r"Baseline & \num{%d} & \num{%d} & \num{%d} & 100 \\"
-            % (
-                nonfused_bytes_read,
-                nonfused_bytes_written,
-                nonfused_bytes_total))
+        latex_table(
+            "lr",
+            ("Operator",
+             r"\parbox{1in}{\centering \% Memory Ops. Due to Scalar Assignments}"),
+            (
+                ("2D: Baseline",
+                 "%.1f" % (
+                        100 * result2d["nonfused_bytes_total_by_scalar_assignments"]
+                        / result2d["nonfused_bytes_total"])),
+                ("2D: Inlined",
+                 "%.1f" % (
+                        100 * result2d["fused_bytes_total_by_scalar_assignments"]
+                        / result2d["fused_bytes_total"])),
+                ("3D: Baseline",
+                 "%.1f" % (
+                        100 * result3d["nonfused_bytes_total_by_scalar_assignments"]
+                        / result3d["nonfused_bytes_total"])),
+                ("3D: Inlined",
+                  "%.1f" % (
+                        100 * result3d["fused_bytes_total_by_scalar_assignments"]
+                        / result3d["fused_bytes_total"])),
+            )),
+        file=outf)
+
+    logger.info(f"wrote to {out_path}")
+
+
+def scalar_assignment_effect_of_fusion_mem_ops_table():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    result2d = mem_ops_results(queue, 2)
+    result3d = mem_ops_results(queue, 3)
+
+    out_path = "scalar-assignments-mem-op-counts.tex"
+    outf = open(out_path, "w")
+
     print(
-            r"Fused & \num{%d} & \num{%d} & \num{%d} & \num{%.1f} \\"
-            % (
-                fused_bytes_read,
-                fused_bytes_written,
-                fused_bytes_total,
-                100 * fused_bytes_total / nonfused_bytes_total))
-    print(r"\bottomrule")
-    print(r"\end{tabular}")
+        latex_table(
+            "lrrrr",
+            ("Operator",
+             r"Bytes Read",
+             r"Bytes Written",
+             r"Total",
+             r"\% of Baseline"),
+            (
+                ("2D: Baseline",
+                 r"\num{%d}" % (
+                     result2d["nonfused_bytes_read_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result2d["nonfused_bytes_written_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result2d["nonfused_bytes_total_by_scalar_assignments"]),
+                 "100"),
+                ("2D: Inlined",
+                 r"\num{%d}" % (
+                     result2d["fused_bytes_read_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result2d["fused_bytes_written_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result2d["fused_bytes_total_by_scalar_assignments"]),
+                 r"%.1f" % (
+                     100 * result2d["fused_bytes_total_by_scalar_assignments"]
+                     / result2d["nonfused_bytes_total_by_scalar_assignments"])),
+                ("3D: Baseline",
+                 r"\num{%d}" % (
+                     result3d["nonfused_bytes_read_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result3d["nonfused_bytes_written_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result3d["nonfused_bytes_total_by_scalar_assignments"]),
+                 "100"),
+                ("3D: Inlined",
+                 r"\num{%d}" % (
+                     result3d["fused_bytes_read_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result3d["fused_bytes_written_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result3d["fused_bytes_total_by_scalar_assignments"]),
+                 r"%.1f" % (
+                     100 * result3d["fused_bytes_total_by_scalar_assignments"]
+                     / result3d["nonfused_bytes_total_by_scalar_assignments"])),
+            )),
+        file=outf)
+
+    logger.info(f"wrote to {out_path}")
 
 # }}}
 
 
 if __name__ == "__main__":
     statement_counts_table()
-    mem_ops_table()
+    scalar_assignment_percent_of_total_mem_ops_table()
+    scalar_assignment_effect_of_fusion_mem_ops_table()
-- 
GitLab


From dca590277526c97e1ccaf189db65f5283c0ca001 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 8 May 2019 18:32:46 -0500
Subject: [PATCH 20/35] Add problem stats gathering code

---
 examples/dagrt_fusion/fusion-study.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index 1fad3375..4f301c83 100755
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -898,6 +898,24 @@ def latex_table(table_format, header, rows):
     return "\n".join(result)
 
 
+def problem_stats(order=3):
+    cl_ctx = cl.create_some_context()
+
+    _, dg_discr_2d = get_strong_wave_op_with_discr(cl_ctx, dims=2, order=order)
+    print("Number of 2D elements:", dg_discr_2d.mesh.nelements)
+    vol_discr_2d = dg_discr_2d.discr_from_dd("vol")
+    dofs_2d = set(group.nunit_nodes for group in vol_discr_2d.groups)
+    from pytools import one
+    print("Number of DOFs per 2D element:", one(dofs_2d))
+
+    _, dg_discr_3d = get_strong_wave_op_with_discr(cl_ctx, dims=3, order=order)
+    print("Number of 3D elements:", dg_discr_3d.mesh.nelements)
+    vol_discr_3d = dg_discr_3d.discr_from_dd("vol")
+    dofs_3d = set(group.nunit_nodes for group in vol_discr_3d.groups)
+    from pytools import one
+    print("Number of DOFs per 3D element:", one(dofs_3d))
+
+
 def statement_counts_table():
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
@@ -1085,6 +1103,7 @@ def scalar_assignment_effect_of_fusion_mem_ops_table():
 
 
 if __name__ == "__main__":
+    problem_stats()
     statement_counts_table()
     scalar_assignment_percent_of_total_mem_ops_table()
     scalar_assignment_effect_of_fusion_mem_ops_table()
-- 
GitLab


From ab9b445075357cfe0662099343d04d5cc8eef19a Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 8 May 2019 18:42:32 -0500
Subject: [PATCH 21/35] flake8 fix

---
 grudge/symbolic/compiler.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index 81a0af6a..b60fd6d1 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -1155,14 +1155,11 @@ class OperatorCompiler(mappers.IdentityMapper):
         from grudge.symbolic.dofdesc_inference import DOFDescInferenceMapper
         inf_mapper = DOFDescInferenceMapper(discr_code + eval_code)
 
-        if 1:
-            eval_code = aggregate_assignments(
+        eval_code = aggregate_assignments(
                 inf_mapper, eval_code, result, self.max_vectors_in_batch_expr)
 
-            discr_code = rewrite_insn_to_loopy_insns(inf_mapper, discr_code)
-            eval_code = rewrite_insn_to_loopy_insns(inf_mapper, eval_code)
-
-        code = Code(eval_code, result)
+        discr_code = rewrite_insn_to_loopy_insns(inf_mapper, discr_code)
+        eval_code = rewrite_insn_to_loopy_insns(inf_mapper, eval_code)
 
         from pytools.obj_array import make_obj_array
         return (
-- 
GitLab


From 11b1833150b9c35cc917c96108f63a69eee661ef Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 8 May 2019 19:15:10 -0500
Subject: [PATCH 22/35] Add a test for memory modeling

---
 examples/dagrt_fusion/fusion-study.py | 47 ++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index 4f301c83..8dea5d27 100755
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -168,11 +168,11 @@ def transcribe_phase(dag, field_var_name, field_components, phase_name,
                 raise NotImplementedError("lhs of statement %s is not a variable: %s"
                         % (stmt.id, stmt.lhs))
             ctx[stmt.lhs.name] = sym.cse(
-                DagrtToGrudgeRewriter(ctx)(stmt.rhs),
-                (
-                    stmt.lhs.name
-                    .replace("<", "")
-                    .replace(">", "")))
+                    DagrtToGrudgeRewriter(ctx)(stmt.rhs),
+                    (
+                        stmt.lhs.name
+                        .replace("<", "")
+                        .replace(">", "")))
 
         elif isinstance(stmt, lang.AssignFunctionCall):
             if stmt.function_id != rhs_name:
@@ -200,8 +200,11 @@ def transcribe_phase(dag, field_var_name, field_components, phase_name,
         elif isinstance(stmt, lang.YieldState):
             d2g = DagrtToGrudgeRewriter(ctx)
             yielded_states.append(
-                (stmt.time_id, d2g(stmt.time), stmt.component_id,
-                    d2g(stmt.expression)))
+                    (
+                        stmt.time_id,
+                        d2g(stmt.time),
+                        stmt.component_id,
+                        d2g(stmt.expression)))
 
         else:
             raise NotImplementedError("statement %s is of unsupported type ''%s'"
@@ -627,6 +630,32 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
 
 # {{{ mem op counter check
 
+def test_assignment_memory_model(ctx_factory):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    _, discr = get_strong_wave_op_with_discr(cl_ctx, dims=2, order=3)
+
+    # Assignment instruction
+    bound_op = bind(
+            discr,
+            sym.Variable("input0", sym.DD_VOLUME)
+            + sym.Variable("input1", sym.DD_VOLUME),
+            exec_mapper_factory=ExecutionMapperWithMemOpCounting)
+
+    input0 = discr.zeros(queue)
+    input1 = discr.zeros(queue)
+
+    result, profile_data = bound_op(
+            queue,
+            profile_data={},
+            input0=input0,
+            input1=input1)
+
+    assert profile_data["bytes_read"] == input0.nbytes + input1.nbytes
+    assert profile_data["bytes_written"] == result.nbytes
+
+
 @pytest.mark.parametrize("use_fusion", (True, False))
 def test_stepper_mem_ops(ctx_factory, use_fusion):
     cl_ctx = ctx_factory()
@@ -904,14 +933,14 @@ def problem_stats(order=3):
     _, dg_discr_2d = get_strong_wave_op_with_discr(cl_ctx, dims=2, order=order)
     print("Number of 2D elements:", dg_discr_2d.mesh.nelements)
     vol_discr_2d = dg_discr_2d.discr_from_dd("vol")
-    dofs_2d = set(group.nunit_nodes for group in vol_discr_2d.groups)
+    dofs_2d = {group.nunit_nodes for group in vol_discr_2d.groups}
     from pytools import one
     print("Number of DOFs per 2D element:", one(dofs_2d))
 
     _, dg_discr_3d = get_strong_wave_op_with_discr(cl_ctx, dims=3, order=order)
     print("Number of 3D elements:", dg_discr_3d.mesh.nelements)
     vol_discr_3d = dg_discr_3d.discr_from_dd("vol")
-    dofs_3d = set(group.nunit_nodes for group in vol_discr_3d.groups)
+    dofs_3d = {group.nunit_nodes for group in vol_discr_3d.groups}
     from pytools import one
     print("Number of DOFs per 3D element:", one(dofs_3d))
 
-- 
GitLab


From 2404372065a023f804a65b1b79a32caa87a99dff Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 10 May 2019 16:07:43 -0500
Subject: [PATCH 23/35] Move unported exampels to unported-examples

---
 examples/conftest.py                                       | 7 -------
 examples/wave/var-propagation-speed.py                     | 6 +++---
 examples/wave/wave-min.py                                  | 2 +-
 {examples => unported-examples}/advection/advection.py     | 0
 {examples => unported-examples}/advection/var-velocity.py  | 0
 {examples => unported-examples}/advection/weak.py          | 0
 .../benchmark_grudge/benchmark_mpi.py                      | 0
 .../benchmark_grudge/run_benchmark.sh                      | 0
 {examples => unported-examples}/burgers/burgers.py         | 0
 {examples => unported-examples}/gas_dynamics/box-in-box.py | 0
 .../gas_dynamics/euler/sine-wave.py                        | 0
 .../gas_dynamics/euler/sod-2d.py                           | 0
 .../gas_dynamics/euler/vortex-adaptive-grid.py             | 0
 .../gas_dynamics/euler/vortex-sources.py                   | 0
 .../gas_dynamics/euler/vortex.py                           | 0
 .../gas_dynamics/gas_dynamics_initials.py                  | 0
 {examples => unported-examples}/gas_dynamics/lbm-simple.py | 0
 {examples => unported-examples}/gas_dynamics/naca.py       | 0
 .../gas_dynamics/navierstokes/shearflow.py                 | 0
 {examples => unported-examples}/gas_dynamics/square.py     | 0
 {examples => unported-examples}/gas_dynamics/wing.py       | 0
 {examples => unported-examples}/geometry.py                | 0
 {examples => unported-examples}/heat/heat.py               | 0
 {examples => unported-examples}/maxwell/.gitignore         | 0
 {examples => unported-examples}/maxwell/cavities.py        | 0
 {examples => unported-examples}/maxwell/dipole.py          | 0
 .../maxwell/generate-bessel-zeros.py                       | 0
 {examples => unported-examples}/maxwell/inhom-cavity.py    | 0
 .../maxwell/inhomogeneous_waveguide.mac                    | 0
 {examples => unported-examples}/maxwell/maxwell-2d.py      | 0
 {examples => unported-examples}/maxwell/maxwell-pml.py     | 0
 {examples => unported-examples}/maxwell/notes.tm           | 0
 {examples => unported-examples}/poisson/helmholtz.py       | 0
 {examples => unported-examples}/poisson/poisson.py         | 0
 34 files changed, 4 insertions(+), 11 deletions(-)
 delete mode 100644 examples/conftest.py
 rename {examples => unported-examples}/advection/advection.py (100%)
 rename {examples => unported-examples}/advection/var-velocity.py (100%)
 rename {examples => unported-examples}/advection/weak.py (100%)
 rename {examples => unported-examples}/benchmark_grudge/benchmark_mpi.py (100%)
 rename {examples => unported-examples}/benchmark_grudge/run_benchmark.sh (100%)
 rename {examples => unported-examples}/burgers/burgers.py (100%)
 rename {examples => unported-examples}/gas_dynamics/box-in-box.py (100%)
 rename {examples => unported-examples}/gas_dynamics/euler/sine-wave.py (100%)
 rename {examples => unported-examples}/gas_dynamics/euler/sod-2d.py (100%)
 rename {examples => unported-examples}/gas_dynamics/euler/vortex-adaptive-grid.py (100%)
 rename {examples => unported-examples}/gas_dynamics/euler/vortex-sources.py (100%)
 rename {examples => unported-examples}/gas_dynamics/euler/vortex.py (100%)
 rename {examples => unported-examples}/gas_dynamics/gas_dynamics_initials.py (100%)
 rename {examples => unported-examples}/gas_dynamics/lbm-simple.py (100%)
 rename {examples => unported-examples}/gas_dynamics/naca.py (100%)
 rename {examples => unported-examples}/gas_dynamics/navierstokes/shearflow.py (100%)
 rename {examples => unported-examples}/gas_dynamics/square.py (100%)
 rename {examples => unported-examples}/gas_dynamics/wing.py (100%)
 rename {examples => unported-examples}/geometry.py (100%)
 rename {examples => unported-examples}/heat/heat.py (100%)
 rename {examples => unported-examples}/maxwell/.gitignore (100%)
 rename {examples => unported-examples}/maxwell/cavities.py (100%)
 rename {examples => unported-examples}/maxwell/dipole.py (100%)
 rename {examples => unported-examples}/maxwell/generate-bessel-zeros.py (100%)
 rename {examples => unported-examples}/maxwell/inhom-cavity.py (100%)
 rename {examples => unported-examples}/maxwell/inhomogeneous_waveguide.mac (100%)
 rename {examples => unported-examples}/maxwell/maxwell-2d.py (100%)
 rename {examples => unported-examples}/maxwell/maxwell-pml.py (100%)
 rename {examples => unported-examples}/maxwell/notes.tm (100%)
 rename {examples => unported-examples}/poisson/helmholtz.py (100%)
 rename {examples => unported-examples}/poisson/poisson.py (100%)

diff --git a/examples/conftest.py b/examples/conftest.py
deleted file mode 100644
index 273b1c4a..00000000
--- a/examples/conftest.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# This file tells py.test to scan for test_xxx() functions in
-# any file below here, not just those named test_xxxx.py.
-
-
-def pytest_collect_file(path, parent):
-    if "grudge/examples" in str(path.dirpath()) and path.ext == ".py":
-        return parent.Module(path, parent=parent)
diff --git a/examples/wave/var-propagation-speed.py b/examples/wave/var-propagation-speed.py
index f161d034..a7a634f7 100644
--- a/examples/wave/var-propagation-speed.py
+++ b/examples/wave/var-propagation-speed.py
@@ -35,7 +35,7 @@ def main(write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
-    dims = 3
+    dims = 2
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dims,
@@ -90,7 +90,7 @@ def main(write_output=True, order=4):
 
     dt_stepper = set_up_rk4("w", dt, fields, rhs)
 
-    final_t = 10
+    final_t = 1
     nsteps = int(final_t/dt)
     print("dt=%g nsteps=%d" % (dt, nsteps))
 
@@ -113,7 +113,7 @@ def main(write_output=True, order=4):
             print(step, event.t, norm(queue, u=event.state_component[0]),
                     time()-t_last_step)
             if step % 10 == 0:
-                vis.write_vtk_file("fld-%04d.vtu" % step,
+                vis.write_vtk_file("fld-var-propogation-speed-%04d.vtu" % step,
                         [
                             ("u", event.state_component[0]),
                             ("v", event.state_component[1:]),
diff --git a/examples/wave/wave-min.py b/examples/wave/wave-min.py
index d7ebffd3..d793e3a0 100644
--- a/examples/wave/wave-min.py
+++ b/examples/wave/wave-min.py
@@ -113,7 +113,7 @@ def main(write_output=True, order=4):
             print(step, event.t, norm(queue, u=event.state_component[0]),
                     time()-t_last_step)
             if step % 10 == 0:
-                vis.write_vtk_file("fld-%04d.vtu" % step,
+                vis.write_vtk_file("fld-wave-min-%04d.vtu" % step,
                         [
                             ("u", event.state_component[0]),
                             ("v", event.state_component[1:]),
diff --git a/examples/advection/advection.py b/unported-examples/advection/advection.py
similarity index 100%
rename from examples/advection/advection.py
rename to unported-examples/advection/advection.py
diff --git a/examples/advection/var-velocity.py b/unported-examples/advection/var-velocity.py
similarity index 100%
rename from examples/advection/var-velocity.py
rename to unported-examples/advection/var-velocity.py
diff --git a/examples/advection/weak.py b/unported-examples/advection/weak.py
similarity index 100%
rename from examples/advection/weak.py
rename to unported-examples/advection/weak.py
diff --git a/examples/benchmark_grudge/benchmark_mpi.py b/unported-examples/benchmark_grudge/benchmark_mpi.py
similarity index 100%
rename from examples/benchmark_grudge/benchmark_mpi.py
rename to unported-examples/benchmark_grudge/benchmark_mpi.py
diff --git a/examples/benchmark_grudge/run_benchmark.sh b/unported-examples/benchmark_grudge/run_benchmark.sh
similarity index 100%
rename from examples/benchmark_grudge/run_benchmark.sh
rename to unported-examples/benchmark_grudge/run_benchmark.sh
diff --git a/examples/burgers/burgers.py b/unported-examples/burgers/burgers.py
similarity index 100%
rename from examples/burgers/burgers.py
rename to unported-examples/burgers/burgers.py
diff --git a/examples/gas_dynamics/box-in-box.py b/unported-examples/gas_dynamics/box-in-box.py
similarity index 100%
rename from examples/gas_dynamics/box-in-box.py
rename to unported-examples/gas_dynamics/box-in-box.py
diff --git a/examples/gas_dynamics/euler/sine-wave.py b/unported-examples/gas_dynamics/euler/sine-wave.py
similarity index 100%
rename from examples/gas_dynamics/euler/sine-wave.py
rename to unported-examples/gas_dynamics/euler/sine-wave.py
diff --git a/examples/gas_dynamics/euler/sod-2d.py b/unported-examples/gas_dynamics/euler/sod-2d.py
similarity index 100%
rename from examples/gas_dynamics/euler/sod-2d.py
rename to unported-examples/gas_dynamics/euler/sod-2d.py
diff --git a/examples/gas_dynamics/euler/vortex-adaptive-grid.py b/unported-examples/gas_dynamics/euler/vortex-adaptive-grid.py
similarity index 100%
rename from examples/gas_dynamics/euler/vortex-adaptive-grid.py
rename to unported-examples/gas_dynamics/euler/vortex-adaptive-grid.py
diff --git a/examples/gas_dynamics/euler/vortex-sources.py b/unported-examples/gas_dynamics/euler/vortex-sources.py
similarity index 100%
rename from examples/gas_dynamics/euler/vortex-sources.py
rename to unported-examples/gas_dynamics/euler/vortex-sources.py
diff --git a/examples/gas_dynamics/euler/vortex.py b/unported-examples/gas_dynamics/euler/vortex.py
similarity index 100%
rename from examples/gas_dynamics/euler/vortex.py
rename to unported-examples/gas_dynamics/euler/vortex.py
diff --git a/examples/gas_dynamics/gas_dynamics_initials.py b/unported-examples/gas_dynamics/gas_dynamics_initials.py
similarity index 100%
rename from examples/gas_dynamics/gas_dynamics_initials.py
rename to unported-examples/gas_dynamics/gas_dynamics_initials.py
diff --git a/examples/gas_dynamics/lbm-simple.py b/unported-examples/gas_dynamics/lbm-simple.py
similarity index 100%
rename from examples/gas_dynamics/lbm-simple.py
rename to unported-examples/gas_dynamics/lbm-simple.py
diff --git a/examples/gas_dynamics/naca.py b/unported-examples/gas_dynamics/naca.py
similarity index 100%
rename from examples/gas_dynamics/naca.py
rename to unported-examples/gas_dynamics/naca.py
diff --git a/examples/gas_dynamics/navierstokes/shearflow.py b/unported-examples/gas_dynamics/navierstokes/shearflow.py
similarity index 100%
rename from examples/gas_dynamics/navierstokes/shearflow.py
rename to unported-examples/gas_dynamics/navierstokes/shearflow.py
diff --git a/examples/gas_dynamics/square.py b/unported-examples/gas_dynamics/square.py
similarity index 100%
rename from examples/gas_dynamics/square.py
rename to unported-examples/gas_dynamics/square.py
diff --git a/examples/gas_dynamics/wing.py b/unported-examples/gas_dynamics/wing.py
similarity index 100%
rename from examples/gas_dynamics/wing.py
rename to unported-examples/gas_dynamics/wing.py
diff --git a/examples/geometry.py b/unported-examples/geometry.py
similarity index 100%
rename from examples/geometry.py
rename to unported-examples/geometry.py
diff --git a/examples/heat/heat.py b/unported-examples/heat/heat.py
similarity index 100%
rename from examples/heat/heat.py
rename to unported-examples/heat/heat.py
diff --git a/examples/maxwell/.gitignore b/unported-examples/maxwell/.gitignore
similarity index 100%
rename from examples/maxwell/.gitignore
rename to unported-examples/maxwell/.gitignore
diff --git a/examples/maxwell/cavities.py b/unported-examples/maxwell/cavities.py
similarity index 100%
rename from examples/maxwell/cavities.py
rename to unported-examples/maxwell/cavities.py
diff --git a/examples/maxwell/dipole.py b/unported-examples/maxwell/dipole.py
similarity index 100%
rename from examples/maxwell/dipole.py
rename to unported-examples/maxwell/dipole.py
diff --git a/examples/maxwell/generate-bessel-zeros.py b/unported-examples/maxwell/generate-bessel-zeros.py
similarity index 100%
rename from examples/maxwell/generate-bessel-zeros.py
rename to unported-examples/maxwell/generate-bessel-zeros.py
diff --git a/examples/maxwell/inhom-cavity.py b/unported-examples/maxwell/inhom-cavity.py
similarity index 100%
rename from examples/maxwell/inhom-cavity.py
rename to unported-examples/maxwell/inhom-cavity.py
diff --git a/examples/maxwell/inhomogeneous_waveguide.mac b/unported-examples/maxwell/inhomogeneous_waveguide.mac
similarity index 100%
rename from examples/maxwell/inhomogeneous_waveguide.mac
rename to unported-examples/maxwell/inhomogeneous_waveguide.mac
diff --git a/examples/maxwell/maxwell-2d.py b/unported-examples/maxwell/maxwell-2d.py
similarity index 100%
rename from examples/maxwell/maxwell-2d.py
rename to unported-examples/maxwell/maxwell-2d.py
diff --git a/examples/maxwell/maxwell-pml.py b/unported-examples/maxwell/maxwell-pml.py
similarity index 100%
rename from examples/maxwell/maxwell-pml.py
rename to unported-examples/maxwell/maxwell-pml.py
diff --git a/examples/maxwell/notes.tm b/unported-examples/maxwell/notes.tm
similarity index 100%
rename from examples/maxwell/notes.tm
rename to unported-examples/maxwell/notes.tm
diff --git a/examples/poisson/helmholtz.py b/unported-examples/poisson/helmholtz.py
similarity index 100%
rename from examples/poisson/helmholtz.py
rename to unported-examples/poisson/helmholtz.py
diff --git a/examples/poisson/poisson.py b/unported-examples/poisson/poisson.py
similarity index 100%
rename from examples/poisson/poisson.py
rename to unported-examples/poisson/poisson.py
-- 
GitLab


From 5e4c7c687344a9297fc45934ced628db2074b590 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 10 May 2019 16:11:38 -0500
Subject: [PATCH 24/35] Move wiggly to unported examples

---
 {examples => unported-examples}/wave/wiggly.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {examples => unported-examples}/wave/wiggly.py (100%)

diff --git a/examples/wave/wiggly.py b/unported-examples/wave/wiggly.py
similarity index 100%
rename from examples/wave/wiggly.py
rename to unported-examples/wave/wiggly.py
-- 
GitLab


From 1f3ec9d2ff88c2f90f883a3541cb529c977e1f55 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 10 May 2019 16:11:42 -0500
Subject: [PATCH 25/35] Change output file name

---
 examples/wave/wave-min-mpi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/wave/wave-min-mpi.py b/examples/wave/wave-min-mpi.py
index 04d0b8a3..08705655 100644
--- a/examples/wave/wave-min-mpi.py
+++ b/examples/wave/wave-min-mpi.py
@@ -134,7 +134,7 @@ def main(write_output=True, order=4):
                     time()-t_last_step)
             if step % 10 == 0:
                 vis.write_vtk_file(
-                        "fld-%03d-%04d.vtu" % (
+                        "fld-wave-min-mpi-%03d-%04d.vtu" % (
                             rank,
                             step,
                             ),
-- 
GitLab


From f70f2645ea852c863ac429e358c244cf9d8091ef Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 10 May 2019 16:21:08 -0500
Subject: [PATCH 26/35] Change fusion-study to run as an example

---
 examples/dagrt_fusion/fusion-study.py | 44 +++++++++++++++++++--------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index 8dea5d27..e49aecab 100755
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -33,6 +33,7 @@ THE SOFTWARE.
 import logging
 import numpy as np
 import six
+import sys
 import pyopencl as cl
 import pyopencl.array  # noqa
 import pytest
@@ -57,7 +58,8 @@ logging.basicConfig(level=logging.INFO)
 
 logger = logging.getLogger(__name__)
 
-logger.setLevel(logging.DEBUG)
+
+PRINT_RESULTS_TO_STDOUT = True
 
 
 # {{{ topological sort
@@ -381,7 +383,7 @@ def get_strong_wave_op_with_discr(cl_ctx, dims=2, order=4):
             b=(0.5,)*dims,
             n=(16,)*dims)
 
-    logger.debug("%d elements" % mesh.nelements)
+    logger.debug("%d elements", mesh.nelements)
 
     discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
 
@@ -952,8 +954,11 @@ def statement_counts_table():
     fused_stepper = get_example_stepper(queue, use_fusion=True)
     stepper = get_example_stepper(queue, use_fusion=False)
 
-    out_path = "statement-counts.tex"
-    outf = open(out_path, "w")
+    if PRINT_RESULTS_TO_STDOUT:
+        outf = sys.stdout
+    else:
+        out_path = "statement-counts.tex"
+        outf = open(out_path, "w")
 
     print(
         latex_table(
@@ -1037,8 +1042,11 @@ def scalar_assignment_percent_of_total_mem_ops_table():
     result2d = mem_ops_results(queue, 2)
     result3d = mem_ops_results(queue, 3)
 
-    out_path = "scalar-assignments-mem-op-percentage.tex"
-    outf = open(out_path, "w")
+    if PRINT_RESULTS_TO_STDOUT:
+        outf = sys.stdout
+    else:
+        out_path = "scalar-assignments-mem-op-percentage.tex"
+        outf = open(out_path, "w")
 
     print(
         latex_table(
@@ -1065,8 +1073,6 @@ def scalar_assignment_percent_of_total_mem_ops_table():
             )),
         file=outf)
 
-    logger.info(f"wrote to {out_path}")
-
 
 def scalar_assignment_effect_of_fusion_mem_ops_table():
     cl_ctx = cl.create_some_context()
@@ -1075,8 +1081,11 @@ def scalar_assignment_effect_of_fusion_mem_ops_table():
     result2d = mem_ops_results(queue, 2)
     result3d = mem_ops_results(queue, 3)
 
-    out_path = "scalar-assignments-mem-op-counts.tex"
-    outf = open(out_path, "w")
+    if PRINT_RESULTS_TO_STDOUT:
+        outf = sys.stdout
+    else:
+        out_path = "scalar-assignments-mem-op-percentage.tex"
+        outf = open(out_path, "w")
 
     print(
         latex_table(
@@ -1126,13 +1135,22 @@ def scalar_assignment_effect_of_fusion_mem_ops_table():
             )),
         file=outf)
 
-    logger.info(f"wrote to {out_path}")
-
 # }}}
 
 
-if __name__ == "__main__":
+def main():
+    if 1:
+        # Run tests.
+        from py.test import main
+        result = main([__file__])
+        assert result == 0
+
+    # Run examples.
     problem_stats()
     statement_counts_table()
     scalar_assignment_percent_of_total_mem_ops_table()
     scalar_assignment_effect_of_fusion_mem_ops_table()
+
+
+if __name__ == "__main__":
+    main()
-- 
GitLab


From c40a5c66b069fc709047b5c461a12d85b6d0fe99 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 10 May 2019 16:21:17 -0500
Subject: [PATCH 27/35] Add examples build

---
 .gitlab-ci.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fcee7997..1ac04133 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -68,6 +68,23 @@ Python 3 POCL MPI:
     reports:
       junit: test/pytest.xml
 
+Python 3 POCL Examples:
+  script:
+  - export PY_EXE=python3
+  - export PYOPENCL_TEST=portable
+  - export EXTRA_INSTALL="pybind11 numpy mako mpi4py"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh
+  - ". ./build-py-project-and-run-examples.sh"
+  tags:
+  - python3
+  - pocl
+  - large-node
+  except:
+  - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 Documentation:
   script:
   - EXTRA_INSTALL="pybind11 numpy"
-- 
GitLab


From 5ecfd75e587380bf6087411c39adf8e26f9654a1 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 10 May 2019 16:29:00 -0500
Subject: [PATCH 28/35] Install pyvisfile

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1ac04133..ac16577a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -72,7 +72,7 @@ Python 3 POCL Examples:
   script:
   - export PY_EXE=python3
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako mpi4py"
+  - export EXTRA_INSTALL="pybind11 numpy mako mpi4py pyvisfile"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh
   - ". ./build-py-project-and-run-examples.sh"
   tags:
-- 
GitLab


From 13c913908df4ac237cbe718572547c64069f8c20 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 10 May 2019 16:30:39 -0500
Subject: [PATCH 29/35] Print table titles

---
 examples/dagrt_fusion/fusion-study.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index e49aecab..ff467d66 100755
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -955,6 +955,7 @@ def statement_counts_table():
     stepper = get_example_stepper(queue, use_fusion=False)
 
     if PRINT_RESULTS_TO_STDOUT:
+        print("==== Statement Counts ====")
         outf = sys.stdout
     else:
         out_path = "statement-counts.tex"
@@ -1043,6 +1044,7 @@ def scalar_assignment_percent_of_total_mem_ops_table():
     result3d = mem_ops_results(queue, 3)
 
     if PRINT_RESULTS_TO_STDOUT:
+        print("==== Scalar Assigment % of Total Mem Ops ====")
         outf = sys.stdout
     else:
         out_path = "scalar-assignments-mem-op-percentage.tex"
@@ -1082,6 +1084,7 @@ def scalar_assignment_effect_of_fusion_mem_ops_table():
     result3d = mem_ops_results(queue, 3)
 
     if PRINT_RESULTS_TO_STDOUT:
+        print("==== Scalar Assigment Inlining Impact ====")
         outf = sys.stdout
     else:
         out_path = "scalar-assignments-mem-op-percentage.tex"
-- 
GitLab


From e29e0abbf56a094fe20dca8861d60890e3885181 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Fri, 10 May 2019 16:39:37 -0500
Subject: [PATCH 30/35] Install pymetis; nicer table formatting

---
 .gitlab-ci.yml                        |  2 +-
 examples/dagrt_fusion/fusion-study.py | 30 ++++++++++++++++++++++++---
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ac16577a..373083d5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -72,7 +72,7 @@ Python 3 POCL Examples:
   script:
   - export PY_EXE=python3
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako mpi4py pyvisfile"
+  - export EXTRA_INSTALL="pybind11 numpy mako mpi4py pyvisfile pymetis"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh
   - ". ./build-py-project-and-run-examples.sh"
   tags:
diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index ff467d66..64d0839f 100755
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -929,6 +929,30 @@ def latex_table(table_format, header, rows):
     return "\n".join(result)
 
 
+def ascii_table(table_format, header, rows):
+    from pytools import Table
+    table = Table()
+    table.add_row(header)
+
+    for input_row in rows:
+        row = []
+        for item in input_row:
+            if item.startswith(r"\num{"):
+                # Strip \num{...} formatting
+                row.append(item[5:-1])
+            else:
+                row.append(item)
+        table.add_row(row)
+
+    return str(table)
+
+
+if PRINT_RESULTS_TO_STDOUT:
+    table = ascii_table
+else:
+    table = latex_table
+
+
 def problem_stats(order=3):
     cl_ctx = cl.create_some_context()
 
@@ -962,7 +986,7 @@ def statement_counts_table():
         outf = open(out_path, "w")
 
     print(
-        latex_table(
+        table(
             "lr",
             ("Operator", "Grudge Node Count"),
             (
@@ -1051,7 +1075,7 @@ def scalar_assignment_percent_of_total_mem_ops_table():
         outf = open(out_path, "w")
 
     print(
-        latex_table(
+        table(
             "lr",
             ("Operator",
              r"\parbox{1in}{\centering \% Memory Ops. Due to Scalar Assignments}"),
@@ -1091,7 +1115,7 @@ def scalar_assignment_effect_of_fusion_mem_ops_table():
         outf = open(out_path, "w")
 
     print(
-        latex_table(
+        table(
             "lrrrr",
             ("Operator",
              r"Bytes Read",
-- 
GitLab


From 37a54cae2ceb1755f10bf793d14617c92ca9e9ef Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 12 May 2019 20:55:13 -0500
Subject: [PATCH 31/35] Update example to work with function registry

---
 examples/dagrt_fusion/fusion-study.py | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt_fusion/fusion-study.py
index 7554ccc2..be46a2a8 100755
--- a/examples/dagrt_fusion/fusion-study.py
+++ b/examples/dagrt_fusion/fusion-study.py
@@ -327,19 +327,15 @@ class RK4TimeStepper(RK4TimeStepperBase):
         """
         super().__init__(queue, component_getter)
 
-        from pymbolic import var
-
         # Construct sym_rhs to have the effect of replacing the RHS calls in the
         # dagrt code with calls of the grudge operator.
         from grudge.symbolic.primitives import FunctionSymbol, Variable
-        call = sym.cse(ExternalCall(
-                FunctionSymbol("grudge_op"),
-                (
+        call = sym.cse(
+                FunctionSymbol("grudge_op")(*(
                     (Variable("t", dd=sym.DD_SCALAR),)
                     + tuple(
                         Variable(field_var_name, dd=sym.DD_VOLUME)[i]
-                        for i in range(num_fields))),
-                dd=sym.DD_VOLUME))
+                        for i in range(num_fields)))))
 
         from pytools.obj_array import join_fields
         sym_rhs = join_fields(*(call[i] for i in range(num_fields)))
@@ -347,7 +343,7 @@ class RK4TimeStepper(RK4TimeStepperBase):
         self.queue = queue
         self.grudge_bound_op = grudge_bound_op
 
-        from dagrt.function_registry import register_external_function
+        from grudge.function_registry import register_external_function
 
         freg = register_external_function(
                 base_function_registry,
@@ -495,10 +491,6 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
     def __init__(self, queue, context, bound_op):
         super().__init__(queue, context, bound_op)
 
-    def map_call(self, expr):
-        # Should have been caught by our op counter
-        assert False, ("map_call called: %s" % expr)
-
     # {{{ expressions
 
     def map_profiled_call(self, expr, profile_data):
@@ -791,14 +783,6 @@ def time_insn(f):
 
 class ExecutionMapperWithTiming(ExecutionMapper):
 
-    def map_call(self, expr):
-        # Should have been caught by our implementation.
-        assert False, ("map_call called: %s" % (expr))
-
-    def map_operator_binding(self, expr):
-        # Should have been caught by our implementation.
-        assert False, ("map_operator_binding called: %s" % expr)
-
     def map_profiled_call(self, expr, profile_data):
         args = [self.rec(p) for p in expr.parameters]
         return self.function_registry[expr.function.name](
-- 
GitLab


From 125229c50b5b8f45e2d5326367e91f569080a56a Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 12 May 2019 20:56:23 -0500
Subject: [PATCH 32/35] Whitespace fix

---
 grudge/execution.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/grudge/execution.py b/grudge/execution.py
index 66e483f3..4fe592dd 100644
--- a/grudge/execution.py
+++ b/grudge/execution.py
@@ -596,7 +596,7 @@ def process_sym_operator(discrwb, sym_operator, post_bind_mapper=None,
 def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
         function_registry=base_function_registry,
         exec_mapper_factory=ExecutionMapper,
-        debug_flags=frozenset(), allocator=None):   
+        debug_flags=frozenset(), allocator=None):
     # from grudge.symbolic.mappers import QuadratureUpsamplerRemover
     # sym_operator = QuadratureUpsamplerRemover(self.quad_min_degrees)(
     #         sym_operator)
@@ -624,9 +624,9 @@ def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
     bound_op = BoundOperator(discr, discr_code, eval_code,
             function_registry=function_registry,
             exec_mapper_factory=exec_mapper_factory,
-            debug_flags=debug_flags,                             
+            debug_flags=debug_flags,
             allocator=allocator)
-    
+
     if "dump_op_code" in debug_flags:
         from grudge.tools import open_unique_debug_file
         outf, _ = open_unique_debug_file("op-code", ".txt")
-- 
GitLab


From cc2a092bceb77e7f1c7b91e1fc8501b6c19bdcf8 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 12 May 2019 22:47:40 -0500
Subject: [PATCH 33/35] Port over some examples

---
 .../advection/var-velocity.py                       | 12 +++++++-----
 {unported-examples => examples}/advection/weak.py   |  2 +-
 .../fusion-study.py => dagrt-fusion.py}             |  0
 {unported-examples => examples}/geometry.py         |  6 +++---
 {unported-examples => examples}/maxwell/cavities.py | 13 ++++++++-----
 5 files changed, 19 insertions(+), 14 deletions(-)
 rename {unported-examples => examples}/advection/var-velocity.py (94%)
 rename {unported-examples => examples}/advection/weak.py (98%)
 rename examples/{dagrt_fusion/fusion-study.py => dagrt-fusion.py} (100%)
 rename {unported-examples => examples}/geometry.py (92%)
 rename {unported-examples => examples}/maxwell/cavities.py (92%)

diff --git a/unported-examples/advection/var-velocity.py b/examples/advection/var-velocity.py
similarity index 94%
rename from unported-examples/advection/var-velocity.py
rename to examples/advection/var-velocity.py
index 00e4f0c8..bc720731 100644
--- a/unported-examples/advection/var-velocity.py
+++ b/examples/advection/var-velocity.py
@@ -40,6 +40,9 @@ from grudge import sym, bind, DGDiscretizationWithBoundaries
 from pytools.obj_array import join_fields
 
 
+FINAL_TIME = 5
+
+
 def main(write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
@@ -105,10 +108,9 @@ def main(write_output=True, order=4):
     def rhs(t, u):
         return bound_op(queue, t=t, u=u)
 
-    final_time = 50
     dt = dt_factor * h/order**2
-    nsteps = (final_time // dt) + 1
-    dt = final_time/nsteps + 1e-15
+    nsteps = (FINAL_TIME // dt) + 1
+    dt = FINAL_TIME/nsteps + 1e-15
 
     from grudge.shortcuts import set_up_rk4
     dt_stepper = set_up_rk4("u", dt, u, rhs)
@@ -118,14 +120,14 @@ def main(write_output=True, order=4):
 
     step = 0
 
-    for event in dt_stepper.run(t_end=final_time):
+    for event in dt_stepper.run(t_end=FINAL_TIME):
         if isinstance(event, dt_stepper.StateComputed):
 
             step += 1
             if step % 30 == 0:
                 print(step)
 
-                vis.write_vtk_file("fld-%04d.vtu" % step,
+                vis.write_vtk_file("fld-var-velocity-%04d.vtu" % step,
                         [("u", event.state_component)])
 
 
diff --git a/unported-examples/advection/weak.py b/examples/advection/weak.py
similarity index 98%
rename from unported-examples/advection/weak.py
rename to examples/advection/weak.py
index cdadb49e..bd9d61ca 100644
--- a/unported-examples/advection/weak.py
+++ b/examples/advection/weak.py
@@ -103,7 +103,7 @@ def main(write_output=True, order=4):
             step += 1
 
             #print(step, event.t, norm(queue, u=event.state_component[0]))
-            vis.write_vtk_file("fld-%04d.vtu" % step,
+            vis.write_vtk_file("fld-weak-%04d.vtu" % step,
                     [  ("u", event.state_component) ])
 
 
diff --git a/examples/dagrt_fusion/fusion-study.py b/examples/dagrt-fusion.py
similarity index 100%
rename from examples/dagrt_fusion/fusion-study.py
rename to examples/dagrt-fusion.py
diff --git a/unported-examples/geometry.py b/examples/geometry.py
similarity index 92%
rename from unported-examples/geometry.py
rename to examples/geometry.py
index e10254bf..6e146f21 100644
--- a/unported-examples/geometry.py
+++ b/examples/geometry.py
@@ -26,7 +26,7 @@ THE SOFTWARE.
 
 import numpy as np  # noqa
 import pyopencl as cl
-from grudge import sym, bind, Discretization, shortcuts
+from grudge import sym, bind, DGDiscretizationWithBoundaries, shortcuts
 
 
 def main(write_output=True):
@@ -36,14 +36,14 @@ def main(write_output=True):
     from meshmode.mesh.generation import generate_warped_rect_mesh
     mesh = generate_warped_rect_mesh(dim=2, order=4, n=6)
 
-    discr = Discretization(cl_ctx, mesh, order=4)
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=4)
 
     sym_op = sym.normal(sym.BTAG_ALL, mesh.dim)
     #sym_op = sym.nodes(mesh.dim, where=sym.BTAG_ALL)
     print(sym.pretty(sym_op))
     op = bind(discr, sym_op)
     print()
-    print(op.code)
+    print(op.eval_code)
 
     vec = op(queue)
 
diff --git a/unported-examples/maxwell/cavities.py b/examples/maxwell/cavities.py
similarity index 92%
rename from unported-examples/maxwell/cavities.py
rename to examples/maxwell/cavities.py
index 85a7c3de..a58f2739 100644
--- a/unported-examples/maxwell/cavities.py
+++ b/examples/maxwell/cavities.py
@@ -28,11 +28,14 @@ THE SOFTWARE.
 import numpy as np
 import pyopencl as cl
 from grudge.shortcuts import set_up_rk4
-from grudge import sym, bind, Discretization
+from grudge import sym, bind, DGDiscretizationWithBoundaries
 
 from grudge.models.em import get_rectangular_cavity_mode
 
 
+STEPS = 60
+
+
 def main(dims, write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
@@ -43,7 +46,7 @@ def main(dims, write_output=True, order=4):
             b=(1.0,)*dims,
             n=(5,)*dims)
 
-    discr = Discretization(cl_ctx, mesh, order=order)
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
 
     if 0:
         epsilon0 = 8.8541878176e-12  # C**2 / (N m**2)
@@ -84,7 +87,7 @@ def main(dims, write_output=True, order=4):
 
     dt_stepper = set_up_rk4("w", dt, fields, rhs)
 
-    final_t = dt * 600
+    final_t = dt * STEPS
     nsteps = int(final_t/dt)
 
     print("dt=%g nsteps=%d" % (dt, nsteps))
@@ -102,7 +105,7 @@ def main(dims, write_output=True, order=4):
     e, h = op.split_eh(fields)
 
     if 1:
-        vis.write_vtk_file("fld-%04d.vtu" % step,
+        vis.write_vtk_file("fld-cavities-%04d.vtu" % step,
                 [
                     ("e", e),
                     ("h", h),
@@ -119,7 +122,7 @@ def main(dims, write_output=True, order=4):
                     time()-t_last_step)
             if step % 10 == 0:
                 e, h = op.split_eh(event.state_component)
-                vis.write_vtk_file("fld-%04d.vtu" % step,
+                vis.write_vtk_file("fld-cavities-%04d.vtu" % step,
                         [
                             ("e", e),
                             ("h", h),
-- 
GitLab


From 34d4f87851ab28b8fd612e2e6549c59c871f0a83 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Mon, 13 May 2019 06:04:34 +0200
Subject: [PATCH 34/35] Add FunctionSymbol to primitives documentation

---
 grudge/symbolic/primitives.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index c59f1518..63adf3db 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -76,6 +76,7 @@ Symbols
 .. autoclass:: ScalarVariable
 .. autoclass:: make_sym_array
 .. autoclass:: make_sym_mv
+.. autoclass:: FunctionSymbol
 
 .. function :: sqrt(arg)
 .. function :: exp(arg)
-- 
GitLab


From bb4bf3f44afddc6af8167ecc5a55f08f9e3cc6fb Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 13 Jun 2019 00:41:05 -0500
Subject: [PATCH 35/35] Use a wrapped version of the execution mapper to ensure
 all top-level instructions are caught

---
 examples/dagrt-fusion.py | 79 +++++++++++++++++++++++++++-------------
 1 file changed, 54 insertions(+), 25 deletions(-)

diff --git a/examples/dagrt-fusion.py b/examples/dagrt-fusion.py
index be46a2a8..a69f936c 100755
--- a/examples/dagrt-fusion.py
+++ b/examples/dagrt-fusion.py
@@ -44,6 +44,7 @@ import grudge.symbolic.mappers as gmap
 import grudge.symbolic.operators as op
 from grudge.execution import ExecutionMapper
 from grudge.function_registry import base_function_registry
+from pymbolic.mapper import Mapper
 from pymbolic.mapper.evaluator import EvaluationMapper \
         as PymbolicEvaluationMapper
 from pytools import memoize
@@ -482,32 +483,50 @@ def test_stepper_equivalence(ctx_factory, order=4):
 # }}}
 
 
+# {{{ execution mapper wrapper
+
+class ExecutionMapperWrapper(Mapper):
+
+    def __init__(self, queue, context, bound_op):
+        self.inner_mapper = ExecutionMapper(queue, context, bound_op)
+        self.queue = queue
+        self.context = context
+        self.bound_op = bound_op
+
+    def map_variable(self, expr):
+        # Needed, because bound op execution can ask for variable values.
+        return self.inner_mapper.map_variable(expr)
+
+    def map_grudge_variable(self, expr):
+        # See map_variable()
+        return self.inner_mapper.map_grudge_variable(expr)
+
+# }}}
+
+
 # {{{ mem op counter implementation
 
-class ExecutionMapperWithMemOpCounting(ExecutionMapper):
+class ExecutionMapperWithMemOpCounting(ExecutionMapperWrapper):
     # This is a skeleton implementation that only has just enough functionality
     # for the wave-min example to work.
 
-    def __init__(self, queue, context, bound_op):
-        super().__init__(queue, context, bound_op)
-
     # {{{ expressions
 
     def map_profiled_call(self, expr, profile_data):
-        args = [self.rec(p) for p in expr.parameters]
-        return self.function_registry[expr.function.name](
+        args = [self.inner_mapper.rec(p) for p in expr.parameters]
+        return self.inner_mapper.function_registry[expr.function.name](
                 self.queue, *args, profile_data=profile_data)
 
     def map_profiled_essentially_elementwise_linear(self, op, field_expr,
                                                     profile_data):
-        result = getattr(self, op.mapper_method)(op, field_expr)
+        result = getattr(self.inner_mapper, op.mapper_method)(op, field_expr)
 
         if profile_data is not None:
             # We model the cost to load the input and write the output.  In
             # particular, we assume the elementwise matrices are negligible in
             # size and thus ignorable.
 
-            field = self.rec(field_expr)
+            field = self.inner_mapper.rec(field_expr)
             profile_data["bytes_read"] = (
                     profile_data.get("bytes_read", 0) + field.nbytes)
             profile_data["bytes_written"] = (
@@ -547,7 +566,7 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
 
         else:
             logger.debug("assignment not profiled: %s", expr)
-            val = self.rec(expr)
+            val = self.inner_mapper.rec(expr)
 
         return val
 
@@ -561,7 +580,7 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
         kwargs = {}
         kdescr = insn.kernel_descriptor
         for name, expr in six.iteritems(kdescr.input_mappings):
-            val = self.rec(expr)
+            val = self.inner_mapper.rec(expr)
             kwargs[name] = val
             assert not isinstance(val, np.ndarray)
             if profile_data is not None and isinstance(val, pyopencl.array.Array):
@@ -571,7 +590,7 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
                         profile_data.get("bytes_read_by_scalar_assignments", 0)
                         + val.nbytes)
 
-        discr = self.discrwb.discr_from_dd(kdescr.governing_dd)
+        discr = self.inner_mapper.discrwb.discr_from_dd(kdescr.governing_dd)
         for name in kdescr.scalar_args():
             v = kwargs[name]
             if isinstance(v, (int, float)):
@@ -603,28 +622,31 @@ class ExecutionMapperWithMemOpCounting(ExecutionMapper):
 
         for name, expr in zip(insn.names, insn.exprs):
             logger.debug("assignment not profiled: %s <- %s", name, expr)
-            value = self.rec(expr)
-            self.discrwb._discr_scoped_subexpr_name_to_value[name] = value
+            inner_mapper = self.inner_mapper
+            value = inner_mapper.rec(expr)
+            inner_mapper.discrwb._discr_scoped_subexpr_name_to_value[name] = value
             assignments.append((name, value))
 
         return assignments, []
 
     def map_insn_assign_from_discr_scoped(self, insn, profile_data=None):
-        return [(insn.name,
-            self.discrwb._discr_scoped_subexpr_name_to_value[insn.name])], []
+        return [(
+            insn.name,
+            self.inner_mapper.
+                discrwb._discr_scoped_subexpr_name_to_value[insn.name])], []
 
     def map_insn_rank_data_swap(self, insn, profile_data):
         raise NotImplementedError("no profiling for instruction: %s" % insn)
 
     def map_insn_diff_batch_assign(self, insn, profile_data):
-        assignments, futures = super().map_insn_diff_batch_assign(insn)
+        assignments, futures = self.inner_mapper.map_insn_diff_batch_assign(insn)
 
         if profile_data is not None:
             # We model the cost to load the input and write the output.  In
             # particular, we assume the elementwise matrices are negligible in
             # size and thus ignorable.
 
-            field = self.rec(insn.field)
+            field = self.inner_mapper.rec(insn.field)
             profile_data["bytes_read"] = (
                     profile_data.get("bytes_read", 0) + field.nbytes)
 
@@ -781,19 +803,19 @@ def time_insn(f):
     return wrapper
 
 
-class ExecutionMapperWithTiming(ExecutionMapper):
+class ExecutionMapperWithTiming(ExecutionMapperWrapper):
 
     def map_profiled_call(self, expr, profile_data):
-        args = [self.rec(p) for p in expr.parameters]
-        return self.function_registry[expr.function.name](
+        args = [self.inner_mapper.rec(p) for p in expr.parameters]
+        return self.inner_mapper.function_registry[expr.function.name](
                 self.queue, *args, profile_data=profile_data)
 
     def map_profiled_operator_binding(self, expr, profile_data):
         if profile_data is None:
-            return super().map_operator_binding(expr)
+            return self.inner_mapper.map_operator_binding(expr)
 
         start = cl.enqueue_marker(self.queue)
-        retval = super().map_operator_binding(expr)
+        retval = self.inner_mapper.map_operator_binding(expr)
         end = cl.enqueue_marker(self.queue)
         time_field_name = "time_op_%s" % expr.op.mapper_method
         profile_data\
@@ -802,9 +824,16 @@ class ExecutionMapperWithTiming(ExecutionMapper):
 
         return retval
 
+    def map_insn_assign_to_discr_scoped(self, insn, profile_data):
+        return self.inner_mapper.map_insn_assign_to_discr_scoped(insn, profile_data)
+
+    def map_insn_assign_from_discr_scoped(self, insn, profile_data):
+        return self.\
+            inner_mapper.map_insn_assign_from_discr_scoped(insn, profile_data)
+
     @time_insn
     def map_insn_loopy_kernel(self, *args, **kwargs):
-        return super().map_insn_loopy_kernel(*args, **kwargs)
+        return self.inner_mapper.map_insn_loopy_kernel(*args, **kwargs)
 
     def map_insn_assign(self, insn, profile_data):
         if len(insn.exprs) == 1:
@@ -817,11 +846,11 @@ class ExecutionMapperWithTiming(ExecutionMapper):
                 val = self.map_profiled_operator_binding(insn.exprs[0], profile_data)
                 return [(insn.names[0], val)], []
 
-        return super().map_insn_assign(insn, profile_data)
+        return self.inner_mapper.map_insn_assign(insn, profile_data)
 
     @time_insn
     def map_insn_diff_batch_assign(self, insn, profile_data):
-        return super().map_insn_diff_batch_assign(insn, profile_data)
+        return self.inner_mapper.map_insn_diff_batch_assign(insn, profile_data)
 
 # }}}
 
-- 
GitLab