diff --git a/examples/plot-connectivity.py b/examples/plot-connectivity.py
index 48e6be76b7033c4a3f04f55c6958c7efd856f0e0..23dfdcd1d2db6f06466007a173dcf8d843b3c571 100644
--- a/examples/plot-connectivity.py
+++ b/examples/plot-connectivity.py
@@ -2,7 +2,7 @@ from __future__ import division
 
 import numpy as np  # noqa
 import pyopencl as cl
-from meshmode.array_context import PyOpenCLArrayContext
+from meshmode.array_context import PyOpenCLArrayContext
 from meshmode.dof_array import thaw
 
 order = 4
diff --git a/examples/simple-dg.py b/examples/simple-dg.py
index b945687758f172194d859560bc1723064d56971b..6adb73738989fd73329bef70b797c7d5f87c4fde 100644
--- a/examples/simple-dg.py
+++ b/examples/simple-dg.py
@@ -33,7 +33,7 @@ from pytools.obj_array import (
         obj_array_vectorize)
 from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
 from meshmode.dof_array import DOFArray, freeze, thaw
-from meshmode.array_context import PyOpenCLArrayContext, make_loopy_program
+from meshmode.array_context import PyOpenCLProfilingArrayContext, make_loopy_program
 
 
 # Features lost vs. https://github.com/inducer/grudge:
@@ -464,9 +464,9 @@ def bump(actx, discr, t=0):
 
 def main():
     cl_ctx = cl.create_some_context()
-    queue = cl.CommandQueue(cl_ctx)
+    queue = cl.CommandQueue(cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    actx = PyOpenCLArrayContext(queue)
+    actx = PyOpenCLProfilingArrayContext(queue)
 
     nel_1d = 16
     from meshmode.mesh.generation import generate_regular_rect_mesh
diff --git a/meshmode/array_context.py b/meshmode/array_context.py
index 26da731eaa4a6e44873bd66d5a514a18b1e2e8dc..ee3c686e3dd1c59b1bc612410d55781872b316f9 100644
--- a/meshmode/array_context.py
+++ b/meshmode/array_context.py
@@ -22,9 +22,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-
 import numpy as np
 import loopy as lp
+import pyopencl as cl
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from pytools import memoize_method
 
@@ -262,4 +262,166 @@ class PyOpenCLArrayContext(ArrayContext):
 # }}}
 
 
+class ProfileData:
+    """Time, flop and memory-access figures of one profiled kernel call."""
+    time, flops, mem_access = 0, 0, 0
+
+    def __init__(self, time=0, flops=0, mem_access=0):
+        # Instance attributes shadow the class-level defaults above.
+        self.time, self.flops, self.mem_access = time, flops, mem_access
+
+    def __repr__(self):
+        fields = (self.time, self.flops, self.mem_access)
+        return "(time={0}, flops={1}, mem_access={2})".format(*fields)
+
+    def __str__(self):
+        # Same text as __repr__.
+        return self.__repr__()
+
+
+class TimingEvent:
+    """Bundles an event with the program and kwargs it was recorded for."""
+    def __init__(self, event, program, kwargs):
+        # Stored as given; nothing is copied or validated here.
+        self.event, self.program, self.kwargs = event, program, kwargs
+
+
+class PyOpenCLProfilingArrayContext(PyOpenCLArrayContext):
+    """Array context that collects profiling data from :meth:`call_loopy`.
+
+    :meth:`finish_profile_events` turns the events into :class:`ProfileData`
+    and :meth:`print_profiling_data`, also run on deletion, prints a summary.
+    """
+
+    def __init__(self, queue, allocator=None):
+        """Create the context; warn if *queue* lacks ``PROFILING_ENABLE``."""
+        super().__init__(queue, allocator)
+        self.profiling_enabled = bool(
+            queue.properties & cl.command_queue_properties.PROFILING_ENABLE)
+        if not self.profiling_enabled:
+            from warnings import warn
+            warn("Profiling was not enabled in the command queue. Timing data will not be collected.")
+
+        self.events = []  # TimingEvent objects not yet processed
+        self.profiling_data = {}  # program name -> list of ProfileData
+        self.invoker_codes = {}  # program name -> {tuple(kwargs): code}
+
+    def finish_profile_events(self):
+        """Wait for pending events and fold them into ``profiling_data``."""
+        if not self.profiling_enabled:
+            return
+
+        if self.events:
+            cl.wait_for_events([t.event for t in self.events])
+
+        import textwrap
+        for t in self.events:
+            kwargs = t.kwargs
+            program = t.program
+            invoker_code = self.invoker_codes[program.name][tuple(kwargs)]
+            evt = t.event
+            types = {key: value.dtype for key, value in kwargs.items()}
+            param_dict = dict(kwargs)
+
+            # extract integer argument generation code from wrapper
+            code = ""
+            for o in ["shapes", "strides"]:  # "offsets",
+                subs = "# {{{ find integer arguments from " + o
+                start = invoker_code.find(subs) + len(subs)
+                end = invoker_code.find("# }}}", start)
+                code = code + textwrap.dedent(invoker_code[start:end])
+
+            # give every program argument a binding for the exec below
+            for key in program.arg_dict:
+                param_dict.setdefault(key, None)
+
+            # execute integer argument generation code from wrapper
+            exec(code, param_dict)
+
+            # get statistics
+            program = lp.add_and_infer_dtypes(program, types)
+            op_map = lp.get_op_map(program, subgroup_size='guess')
+            mem_map = lp.get_mem_access_map(program, subgroup_size='guess')
+            f32op_count = op_map.filter_by(dtype=[np.float32]).eval_and_sum(param_dict)
+            f64op_count = op_map.filter_by(dtype=[np.float64]).eval_and_sum(param_dict)
+            mem32_count = mem_map.filter_by(dtype=[np.float32]).eval_and_sum(param_dict)
+            mem64_count = mem_map.filter_by(dtype=[np.float64]).eval_and_sum(param_dict)
+
+            flops = f32op_count + f64op_count
+            time = evt.profile.end - evt.profile.start
+            mem_access = mem32_count + mem64_count
+
+            self.profiling_data.setdefault(program.name, []).append(
+                ProfileData(time, flops, mem_access))
+
+        self.events = []
+
+    def print_profiling_data(self):
+        """Print a per-kernel table of call counts and min/avg/max figures."""
+        if not self.profiling_enabled:
+            return
+
+        self.finish_profile_events()
+
+        if not self.profiling_data:
+            # max() below raises ValueError on an empty sequence, and __del__
+            # calls this method even if no kernel was ever run.
+            return
+
+        max_name_len = max(len(name) for name in self.profiling_data)
+        max_name_len = max(max_name_len, len('Function'))
+
+        format_str = "{:<" + str(max_name_len) + "} {:>6} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8}"
+        # the name column's rule spans the computed width, not a fixed 20
+        rule = format_str.format('='*max_name_len, '='*6, *(['='*8] * 10))
+
+        print(rule)
+        print(format_str.format('Function', 'Calls', 'T_min', 'T_avg', 'T_max', 'F_min', 'F_avg', 'F_max', 'M_min', 'M_avg', 'M_max', 'BW_avg'))
+        print(rule)
+
+        from statistics import mean
+        for key, value in self.profiling_data.items():
+            num_values = len(value)
+            times = [v.time for v in value]
+            flops = [v.flops for v in value]
+            mem_access = [v.mem_access for v in value]
+
+            print(format_str.format(key, num_values, min(times), int(mean(times)), max(times), min(flops), int(mean(flops)), max(flops), min(mem_access), int(mean(mem_access)), max(mem_access), round(mean(mem_access)/mean(times), 3)))
+
+        print(rule)
+
+    def __del__(self):
+        # print the summary when the context object is deleted
+        self.print_profiling_data()
+
+    def call_loopy(self, program, **kwargs):
+        """Run *program* on the queue, record its event, return its result."""
+        # NOTE(review): this drops every warning filter in the process and
+        # silences all warnings on each call -- confirm this is intended.
+        from warnings import resetwarnings, filterwarnings
+        resetwarnings()
+        filterwarnings('ignore', category=Warning)
+
+        program = self.transform_loopy_program(program)
+        assert program.options.return_dict
+        assert program.options.no_numpy
+
+        # Determine if we need to get the invoker code (for integer argument generation).
+        # N.B.: The invoker code might be different for the same program with different kwargs
+        codes = self.invoker_codes.setdefault(program.name, {})
+        if tuple(kwargs) not in codes:
+            executor = program.target.get_kernel_executor(program, self.queue)
+            info = executor.kernel_info(executor.arg_to_dtype_set(kwargs))
+            # add to the per-program cache rather than replacing it, so codes
+            # cached for other kwargs stay available to finish_profile_events
+            codes[tuple(kwargs)] = info.invoker.get()
+
+        evt, result = program(self.queue, **kwargs, allocator=self.allocator)
+
+        if self.profiling_enabled:
+            self.events.append(TimingEvent(evt, program, kwargs))
+
+        return result
+
+
 # vim: foldmethod=marker