diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index 386fbc18a2c5089486aba51395d671b4c9a600ff..fcd470dc2da4c836753c2b04b5fc5a1aa224b3db 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -96,6 +96,8 @@ Modifying Arguments
 
 .. autofunction:: rename_argument
 
+.. autofunction:: set_temporary_scope
+
 Creating Batches of Operations
 ------------------------------
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 9d525605b7a41c39400cd4d4beb7e64e38d766f2..b7be43f7e283bc464c88a6cdb49141964f9a8f1c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1440,26 +1440,28 @@ elements in memory. The total number of array accesses has not changed:
     f64 load: 131072
     f64 store: 65536
 
-Counting barriers
-~~~~~~~~~~~~~~~~~
+Counting synchronization events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_barrier_poly` counts the number of barriers per **thread** in a
-kernel. First, we'll call this function on the kernel from the previous example:
+:func:`loopy.get_synchronization_poly` counts the number of synchronization
+events per **thread** in a kernel. First, we'll call this function on the
+kernel from the previous example:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_barrier_poly
-    >>> barrier_poly = get_barrier_poly(knl)
-    >>> print("Barrier polynomial: %s" % barrier_poly)
-    Barrier polynomial: { 0 }
+    >>> from loopy.statistics import get_synchronization_poly
+    >>> barrier_poly = get_synchronization_poly(knl)
+    >>> print(lp.stringify_stats_mapping(barrier_poly))
+    kernel_launch : { 1 }
+    <BLANKLINE>
 
 We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> barrier_count = barrier_poly.eval_with_dict(param_dict)
-    >>> print("Barrier count: %s" % barrier_count)
-    Barrier count: 0
+    >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict)
+    >>> print("Kernel launch count: %s" % launch_count)
+    Kernel launch count: 1
 
 Now to make things more interesting, we'll create a kernel with barriers:
 
@@ -1505,12 +1507,11 @@ using :func:`loopy.get_barrier_poly`:
 
 .. doctest::
 
-    >>> barrier_poly = get_barrier_poly(knl)
-    >>> barrier_count = barrier_poly.eval_with_dict({})
-    >>> print("Barrier polynomial: %s\nBarrier count: %i" %
-    ...     (barrier_poly, barrier_count))
-    Barrier polynomial: { 1000 }
-    Barrier count: 1000
+    >>> sync_map = lp.get_synchronization_poly(knl)
+    >>> print(lp.stringify_stats_mapping(sync_map))
+    barrier_local : { 1000 }
+    kernel_launch : { 1 }
+    <BLANKLINE>
 
 Based on the kernel code printed above, we would expect each thread to encounter
-50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In
+50x10x2 barriers, which matches the result from :func:`loopy.get_synchronization_poly`. In
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 8562df4118534392cf66ada250aacdfdbc0e1917..19d9ddbc72c6de90ad7cbfddc740d74a87b2d2a7 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -78,7 +78,8 @@ from loopy.transform.data import (
         add_prefetch, change_arg_to_image, tag_data_axes,
         set_array_dim_names, remove_unused_arguments,
         alias_temporaries, set_argument_order,
-        rename_argument)
+        rename_argument,
+        set_temporary_scope)
 
 from loopy.transform.subst import (extract_subst,
         assignment_to_subst, expand_subst, find_rules_matching,
@@ -107,7 +108,7 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
         get_gmem_access_poly,
-        get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping,
+        get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping,
         sum_mem_access_to_bytes,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
@@ -166,7 +167,7 @@ __all__ = [
         "add_prefetch", "change_arg_to_image", "tag_data_axes",
         "set_array_dim_names", "remove_unused_arguments",
         "alias_temporaries", "set_argument_order",
-        "rename_argument",
+        "rename_argument", "set_temporary_scope",
 
         "find_instructions", "map_instructions",
         "set_instruction_priority", "add_dependency",
@@ -205,7 +206,8 @@ __all__ = [
 
         "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly",
         "get_DRAM_access_poly",
-        "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes",
+        "get_synchronization_poly", "stringify_stats_mapping",
+        "sum_mem_access_to_bytes",
         "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 0adf4416d44e199f1b329205a3159736acfa3fa3..bada80328c337cc5e756afaf05b180f2b027b1ca 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -79,7 +79,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
     import pyopencl as cl
     import pyopencl.array as cl_array
 
-    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
+    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable
 
     from pymbolic import evaluate
 
@@ -177,6 +177,11 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
                         ref_alloc_size=alloc_size,
                         ref_numpy_strides=numpy_strides,
                         needs_checking=is_output))
+
+        elif arg.arg_class is TemporaryVariable:
+            # global temporary, handled by invocation logic
+            pass
+
         else:
             raise LoopyError("arg type not understood")
 
@@ -191,7 +196,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
     import pyopencl as cl
     import pyopencl.array as cl_array
 
-    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
+    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable
 
     from pymbolic import evaluate
 
@@ -275,6 +280,10 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
             arg_desc.test_numpy_strides = numpy_strides
             arg_desc.test_alloc_size = alloc_size
 
+        elif arg.arg_class is TemporaryVariable:
+            # global temporary, handled by invocation logic
+            pass
+
         else:
             raise LoopyError("arg type not understood")
 
diff --git a/loopy/check.py b/loopy/check.py
index 0ef3d27cfeea142e905dbb38aaaad9afbe06ebcf..910ab24ab3f98a9b01953d30354eda077273423d 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -127,9 +127,35 @@ def check_for_inactive_iname_access(kernel):
                     % insn.id)
 
 
+def _is_racing_iname_tag(tv, tag):
+    """Return whether an iname tagged *tag* races on writes to temporary *tv*.
+
+    Depends on ``tv.scope``; raises :exc:`LoopyError` if that is still *auto*.
+    """
+    from loopy.kernel.data import (temp_var_scope,
+            LocalIndexTagBase, GroupIndexTag, ParallelTag, auto)
+
+    if tv.scope == temp_var_scope.PRIVATE:
+        return (
+                isinstance(tag, ParallelTag)
+                and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag)))
+    elif tv.scope == temp_var_scope.LOCAL:
+        return (
+                isinstance(tag, ParallelTag)
+                and not isinstance(tag, GroupIndexTag))
+    elif tv.scope == temp_var_scope.GLOBAL:
+        return isinstance(tag, ParallelTag)
+    elif tv.scope == auto:
+        raise LoopyError("scope of temp var '%s' has not yet been "
+                "determined" % tv.name)
+    else:
+        raise ValueError("unexpected value of temp_var.scope for "
+                "temporary variable '%s'" % tv.name)
+
+
 def check_for_write_races(kernel):
     from loopy.symbolic import DependencyMapper
-    from loopy.kernel.data import ParallelTag, GroupIndexTag, LocalIndexTagBase
+    from loopy.kernel.data import ParallelTag
     depmap = DependencyMapper(composite_leaves=False)
 
     iname_to_tag = kernel.iname_to_tag.get
@@ -162,26 +188,10 @@ def check_for_write_races(kernel):
 
             elif assignee_name in kernel.temporary_variables:
                 temp_var = kernel.temporary_variables[assignee_name]
-                if temp_var.is_local is True:
-                    raceable_parallel_insn_inames = set(
-                            iname
-                            for iname in kernel.insn_inames(insn)
-                            if isinstance(iname_to_tag(iname), ParallelTag)
-                            and not isinstance(iname_to_tag(iname), GroupIndexTag))
-
-                elif temp_var.is_local is False:
-                    raceable_parallel_insn_inames = set(
+                raceable_parallel_insn_inames = set(
                             iname
                             for iname in kernel.insn_inames(insn)
-                            if isinstance(iname_to_tag(iname), ParallelTag)
-                            and not isinstance(iname_to_tag(iname),
-                                GroupIndexTag)
-                            and not isinstance(iname_to_tag(iname),
-                                LocalIndexTagBase))
-
-                else:
-                    raise LoopyError("temp var '%s' hasn't decided on "
-                            "whether it is local" % temp_var.name)
+                            if _is_racing_iname_tag(temp_var, iname_to_tag(iname)))
 
             else:
                 raise LoopyError("invalid assignee name in instruction '%s'"
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 6eef793c7cf8294e7f8fad11a0757d59898d7ac7..5e3d11396f7a2e5720607700eca1a481c17005e2 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -407,7 +407,7 @@ def generate_code_v2(kernel):
 
     # {{{ examine arg list
 
-    from loopy.kernel.data import ValueArg
+    from loopy.kernel.data import ValueArg, temp_var_scope
     from loopy.kernel.array import ArrayBase
 
     implemented_data_info = []
@@ -432,6 +432,13 @@ def generate_code_v2(kernel):
         else:
             raise ValueError("argument type not understood: '%s'" % type(arg))
 
+    for tv in six.itervalues(kernel.temporary_variables):
+        if tv.scope == temp_var_scope.GLOBAL:
+            implemented_data_info.extend(
+                    tv.decl_info(
+                        kernel.target,
+                        index_dtype=kernel.index_dtype))
+
     allow_complex = False
     for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
         if var.dtype.involves_complex():
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 55feff66a83cbca5dacfb6717e7cb94fc69ed51d..900ed2ba3e06207cf1edd65b614f07c9854e5044 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -302,6 +302,7 @@ def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info
 def generate_arg_setup(gen, kernel, implemented_data_info, options):
     import loopy as lp
 
+    from loopy.kernel.data import KernelArgument
     from loopy.kernel.array import ArrayBase
     from loopy.symbolic import StringifyMapper
     from pymbolic import var
@@ -318,10 +319,20 @@ def generate_arg_setup(gen, kernel, implemented_data_info, options):
 
     strify = StringifyMapper()
 
+    expect_no_more_arguments = False
+
     for arg_idx, arg in enumerate(implemented_data_info):
         is_written = arg.base_name in kernel.get_written_variables()
         kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
 
+        if not issubclass(arg.arg_class, KernelArgument):
+            expect_no_more_arguments = True
+            continue
+
+        if expect_no_more_arguments:
+            raise LoopyError("Further arguments encountered after arg info "
+                    "describing a global temporary variable")
+
         if not issubclass(arg.arg_class, ArrayBase):
             args.append(arg.name)
             continue
@@ -552,9 +563,14 @@ def generate_invoker(kernel, codegen_result):
             "out_host=None"
             ]
 
+    from loopy.kernel.data import KernelArgument
     gen = PythonFunctionGenerator(
             "invoke_%s_loopy_kernel" % kernel.name,
-            system_args + ["%s=None" % iai.name for iai in implemented_data_info])
+            system_args + [
+                "%s=None" % idi.name
+                for idi in implemented_data_info
+                if issubclass(idi.arg_class, KernelArgument)
+                ])
 
     gen.add_to_preamble("from __future__ import division")
     gen.add_to_preamble("")
@@ -600,7 +616,10 @@ def generate_invoker(kernel, codegen_result):
         gen("if out_host:")
         with Indentation(gen):
             gen("pass")  # if no outputs (?!)
-            for arg_idx, arg in enumerate(implemented_data_info):
+            for arg in implemented_data_info:
+                if not issubclass(arg.arg_class, KernelArgument):
+                    continue
+
                 is_written = arg.base_name in kernel.get_written_variables()
                 if is_written:
                     gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))
@@ -611,10 +630,12 @@ def generate_invoker(kernel, codegen_result):
         gen("return _lpy_evt, {%s}"
                 % ", ".join("\"%s\": %s" % (arg.name, arg.name)
                     for arg in implemented_data_info
+                    if issubclass(arg.arg_class, KernelArgument)
                     if arg.base_name in kernel.get_written_variables()))
     else:
         out_args = [arg
                 for arg in implemented_data_info
+                    if issubclass(arg.arg_class, KernelArgument)
                 if arg.base_name in kernel.get_written_variables()]
         if out_args:
             gen("return _lpy_evt, (%s,)"
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 9b2c896ac2e8defc20bb54ea5c0682b68a363263..5ac63b56eb6347d93e65a3fa860840c37cabe146 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -843,9 +843,17 @@ class LoopKernel(RecordWithoutPickling):
 
     @memoize_method
     def global_var_names(self):
+        from loopy.kernel.data import temp_var_scope
+
         from loopy.kernel.data import GlobalArg
-        return set(arg.name for arg in self.args
-            if isinstance(arg, GlobalArg))
+        return (
+                set(
+                    arg.name for arg in self.args
+                    if isinstance(arg, GlobalArg))
+                | set(
+                    tv.name
+                    for tv in six.itervalues(self.temporary_variables)
+                    if tv.scope == temp_var_scope.GLOBAL))
 
     # }}}
 
@@ -1033,14 +1041,17 @@ class LoopKernel(RecordWithoutPickling):
 
     @memoize_method
     def local_var_names(self):
+        from loopy.kernel.data import temp_var_scope
         return set(
             tv.name
             for tv in six.itervalues(self.temporary_variables)
-            if tv.is_local)
+            if tv.scope == temp_var_scope.LOCAL)
 
     def local_mem_use(self):
-        return sum(lv.nbytes for lv in six.itervalues(self.temporary_variables)
-                if lv.is_local)
+        from loopy.kernel.data import temp_var_scope
+        return sum(
+                tv.nbytes for tv in six.itervalues(self.temporary_variables)
+                if tv.scope == temp_var_scope.LOCAL)
 
     # }}}
 
@@ -1213,8 +1224,8 @@ class LoopKernel(RecordWithoutPickling):
         return CompiledKernel(ctx, self)
 
     def __call__(self, queue, **kwargs):
-        return self.get_compiled_kernel(queue.context)(
-                queue, **kwargs)
+        cknl = self.get_compiled_kernel(queue.context)
+        return cknl(queue, **kwargs)
 
     # }}}
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 9fe0f5b7904c47b26b8d0b1b4c4ea5ce039da713..e25320729df3d5b4cceda46c9c406eb80e13ebdf 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -793,7 +793,7 @@ def expand_cses(instructions, cse_prefix="cse_expr"):
         new_temp_vars.append(TemporaryVariable(
                 name=new_var_name,
                 dtype=dtype,
-                is_local=lp.auto,
+                scope=lp.auto,
                 shape=()))
 
         from pymbolic.primitives import Variable
@@ -857,7 +857,7 @@ def create_temporaries(knl, default_order):
                 new_temp_vars[assignee_name] = lp.TemporaryVariable(
                         name=assignee_name,
                         dtype=temp_var_type,
-                        is_local=lp.auto,
+                        scope=lp.auto,
                         base_indices=lp.auto,
                         shape=lp.auto,
                         order=default_order)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 8de5919df44a6201a4cfb123fa1a34516132634e..9c399997de57d1d57f2765b7e1c5d842dec89c55 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -275,19 +275,34 @@ class temp_var_scope:
     .. attribute:: GLOBAL
     """
 
+    # These must occur in ascending order of 'globality' so that
+    # max(scope) does the right thing.
+
     PRIVATE = 0
     LOCAL = 1
     GLOBAL = 2
 
+    @classmethod
+    def stringify(cls, val):
+        """Return the lowercase name of the scope constant *val*."""
+        names = [(cls.PRIVATE, "private"), (cls.LOCAL, "local"),
+                (cls.GLOBAL, "global")]
+        for scope, name in names:
+            if val == scope:
+                return name
+        # *val* matched none of the scope constants
+        raise ValueError("unexpected value of temp_var_scope")
+
 
 class TemporaryVariable(ArrayBase):
     __doc__ = ArrayBase.__doc__ + """
     .. attribute:: storage_shape
     .. attribute:: base_indices
-    .. attribute:: is_local
+    .. attribute:: scope
 
-        Whether this is temporary lives in ``local`` memory.
-        May be *True*, *False*, or :class:`loopy.auto` if this is
+        What memory this temporary variable lives in.
+        One of the values in :class:`temp_var_scope`,
+        or :class:`loopy.auto` if this is
         to be automatically determined.
 
     .. attribute:: base_storage
@@ -304,11 +319,11 @@ class TemporaryVariable(ArrayBase):
     allowed_extra_kwargs = [
             "storage_shape",
             "base_indices",
-            "is_local",
+            "scope",
             "base_storage"
             ]
 
-    def __init__(self, name, dtype=None, shape=(), is_local=auto,
+    def __init__(self, name, dtype=None, shape=(), scope=auto,
             dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
             base_indices=None, storage_shape=None,
             base_storage=None):
@@ -318,10 +333,6 @@ class TemporaryVariable(ArrayBase):
         :arg base_indices: :class:`loopy.auto` or a tuple of base indices
         """
 
-        if is_local is None:
-            raise ValueError("is_local is None is no longer supported. "
-                    "Use loopy.auto.")
-
         if base_indices is None:
             base_indices = (0,) * len(shape)
 
@@ -329,18 +340,25 @@ class TemporaryVariable(ArrayBase):
                 dtype=dtype, shape=shape,
                 dim_tags=dim_tags, offset=offset, dim_names=dim_names,
                 order="C",
-                base_indices=base_indices, is_local=is_local,
+                base_indices=base_indices, scope=scope,
                 storage_shape=storage_shape,
                 base_storage=base_storage)
 
     @property
-    def scope(self):
+    def is_local(self):
-        """One of :class:`loopy.temp_var_scope`."""
+        """Whether :attr:`scope` is local (:class:`loopy.auto` if undecided)."""
 
-        if self.is_local:
-            return temp_var_scope.LOCAL
+        if self.scope is auto:
+            return auto
+        elif self.scope == temp_var_scope.LOCAL:
+            return True
+        elif self.scope == temp_var_scope.PRIVATE:
+            return False
+        elif self.scope == temp_var_scope.GLOBAL:
+            raise LoopyError("TemporaryVariable.is_local called on "
+                    "global temporary variable '%s'" % self.name)
         else:
-            return temp_var_scope.PRIVATE
+            raise LoopyError("unexpected value of TemporaryVariable.scope")
 
     @property
     def nbytes(self):
@@ -356,18 +374,31 @@ class TemporaryVariable(ArrayBase):
                 target, is_written=True, index_dtype=index_dtype,
                 shape_override=self.storage_shape)
 
-    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
-        return None
+    def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
+        """Return the global arg declaration (global temporaries only)."""
+        if self.scope != temp_var_scope.GLOBAL:
+            raise LoopyError("unexpected request for argument declaration of "
+                    "non-global temporary")
+        return ast_builder.get_global_arg_decl(
+                self.name + name_suffix, shape, dtype, is_written)
 
     def __str__(self):
-        return self.stringify(include_typename=False)
+        if self.scope is auto:
+            scope_str = "auto"
+        else:
+            scope_str = temp_var_scope.stringify(self.scope)
+
+        return (
+                self.stringify(include_typename=False)
+                +
+                " scope:%s" % scope_str)
 
     def __eq__(self, other):
         return (
                 super(TemporaryVariable, self).__eq__(other)
                 and self.storage_shape == other.storage_shape
                 and self.base_indices == other.base_indices
-                and self.is_local == other.is_local
+                and self.scope == other.scope
                 and self.base_storage == other.base_storage)
 
     def update_persistent_hash(self, key_hash, key_builder):
@@ -378,7 +409,7 @@ class TemporaryVariable(ArrayBase):
         super(TemporaryVariable, self).update_persistent_hash(key_hash, key_builder)
         key_builder.rec(key_hash, self.storage_shape)
         key_builder.rec(key_hash, self.base_indices)
-        key_builder.rec(key_hash, self.is_local)
+        key_builder.rec(key_hash, self.scope)
 
 # }}}
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 2fdadb48e790d2cd05cc00f4d04c31957505aaa9..8b7be4f0407bb0b9e52f0b6915841af1c5dd7ac5 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -293,30 +293,46 @@ def infer_unknown_types(kernel, expect_completion=False):
 # }}}
 
 
-# {{{ decide which temporaries are local
+# {{{ decide temporary scope
 
-def mark_local_temporaries(kernel):
+def _get_compute_inames_tagged(kernel, insn, tag_base):
+    """Return the set of inames of *insn* whose tag is a *tag_base* instance."""
+    return set(name for name in kernel.insn_inames(insn.id)
+            if isinstance(kernel.iname_to_tag.get(name), tag_base))
+
+
+def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_name):
+    """Return *tag_base*-tagged inames used by *insn* to index *tv_name*."""
+    from loopy.symbolic import get_dependencies
+    tag_of = kernel.iname_to_tag.get
+    return set(
+            iname
+            for aname, aindices in insn.assignees_and_indices()
+            for iname in get_dependencies(aindices) & kernel.all_inames()
+            if aname == tv_name and isinstance(tag_of(iname), tag_base))
+
+
+def find_temporary_scope(kernel):
     logger.debug("%s: mark local temporaries" % kernel.name)
 
     new_temp_vars = {}
-    from loopy.kernel.data import LocalIndexTagBase
+    from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag,
+            temp_var_scope)
     import loopy as lp
 
     writers = kernel.writer_map()
 
-    from loopy.symbolic import get_dependencies
-
     for temp_var in six.itervalues(kernel.temporary_variables):
         # Only fill out for variables that do not yet know if they're
         # local. (I.e. those generated by implicit temporary generation.)
 
-        if temp_var.is_local is not lp.auto:
+        if temp_var.scope is not lp.auto:
             new_temp_vars[temp_var.name] = temp_var
             continue
 
         my_writers = writers.get(temp_var.name, [])
 
-        wants_to_be_local_per_insn = []
+        desired_scope_per_insn = []
         for insn_id in my_writers:
             insn = kernel.id_to_insn[insn_id]
 
@@ -327,54 +343,66 @@ def mark_local_temporaries(kernel):
             # - the instruction is run across more inames (locally) parallel
             #   than are reflected in the assignee indices.
 
-            locparallel_compute_inames = set(iname
-                    for iname in kernel.insn_inames(insn_id)
-                    if isinstance(kernel.iname_to_tag.get(iname), LocalIndexTagBase))
+            locparallel_compute_inames = _get_compute_inames_tagged(
+                    kernel, insn, LocalIndexTagBase)
+
+            locparallel_assignee_inames = _get_assignee_inames_tagged(
+                    kernel, insn, LocalIndexTagBase, temp_var.name)
 
-            locparallel_assignee_inames = set(iname
-                    for aname, aindices in insn.assignees_and_indices()
-                    for iname in get_dependencies(aindices)
-                        & kernel.all_inames()
-                    if aname == temp_var.name
-                    if isinstance(kernel.iname_to_tag.get(iname), LocalIndexTagBase))
+            grpparallel_compute_inames = _get_compute_inames_tagged(
+                    kernel, insn, GroupIndexTag)
+
+            grpparallel_assignee_inames = _get_assignee_inames_tagged(
+                    kernel, insn, GroupIndexTag, temp_var.name)
 
             assert locparallel_assignee_inames <= locparallel_compute_inames
+            assert grpparallel_assignee_inames <= grpparallel_compute_inames
+
+            # apin/cpin: assignee/compute inames parallel in the scope tried
+            desired_scope = temp_var_scope.PRIVATE
+            for iname_descr, scope_descr, apin, cpin, scope in [
+                    ("local", "local", locparallel_assignee_inames,
+                        locparallel_compute_inames, temp_var_scope.LOCAL),
+                    ("group", "global", grpparallel_assignee_inames,
+                        grpparallel_compute_inames, temp_var_scope.GLOBAL),
+                    ]:
+                # warn based on this scope's own assignee inames
+                if (apin != cpin and bool(apin)):
+                    warn(kernel, "write_race_%s(%s)" % (scope_descr, insn_id),
+                            "instruction '%s' looks invalid: "
+                            "it assigns to indices based on %s IDs, but "
+                            "its temporary '%s' cannot be made %s because "
+                            "a write race across the iname(s) '%s' would emerge. "
+                            "(Do you need to add an extra iname to your prefetch?)"
+                            % (insn_id, iname_descr, temp_var.name, scope_descr,
+                                ", ".join(cpin - apin)),
+                            WriteRaceConditionWarning)
+
+                if (apin == cpin
+                        # doesn't want to be in this scope if there aren't any
+                        # parallel inames of that kind:
+                        and bool(cpin)):
+                    desired_scope = max(desired_scope, scope)
+                    break
+
+            desired_scope_per_insn.append(desired_scope)
 
-            if (locparallel_assignee_inames != locparallel_compute_inames
-                    and bool(locparallel_assignee_inames)):
-                warn(kernel, "write_race_local(%s)" % insn_id,
-                        "instruction '%s' looks invalid: "
-                        "it assigns to indices based on local IDs, but "
-                        "its temporary '%s' cannot be made local because "
-                        "a write race across the iname(s) '%s' would emerge. "
-                        "(Do you need to add an extra iname to your prefetch?)"
-                        % (insn_id, temp_var.name, ", ".join(
-                            locparallel_compute_inames
-                            - locparallel_assignee_inames)),
-                        WriteRaceConditionWarning)
-
-            wants_to_be_local_per_insn.append(
-                    locparallel_assignee_inames == locparallel_compute_inames
-
-                    # doesn't want to be local if there aren't any
-                    # parallel inames:
-                    and bool(locparallel_compute_inames))
-
-        if not wants_to_be_local_per_insn:
+        if not desired_scope_per_insn:
             warn(kernel, "temp_to_write(%s)" % temp_var.name,
                     "temporary variable '%s' never written, eliminating"
                     % temp_var.name, LoopyAdvisory)
 
             continue
 
-        is_local = any(wants_to_be_local_per_insn)
+        overall_scope = max(desired_scope_per_insn)
 
         from pytools import all
-        if not all(wtbl == is_local for wtbl in wants_to_be_local_per_insn):
-            raise LoopyError("not all instructions agree on whether "
-                    "temporary '%s' should be in local memory" % temp_var.name)
+        # every writer must have asked for the same scope
+        if not all(iscope == overall_scope for iscope in desired_scope_per_insn):
+            raise LoopyError("not all instructions agree on the desired scope "
+                    "(private/local/global) of the temporary '%s'" % temp_var.name)
 
-        new_temp_vars[temp_var.name] = temp_var.copy(is_local=is_local)
+        new_temp_vars[temp_var.name] = temp_var.copy(scope=overall_scope)
 
     return kernel.copy(temporary_variables=new_temp_vars)
 
@@ -486,14 +514,14 @@ def realize_reduction(kernel, insn_id_filter=None):
                 for i in range(ncomp)]
         acc_vars = tuple(var(n) for n in acc_var_names)
 
-        from loopy.kernel.data import TemporaryVariable
+        from loopy.kernel.data import TemporaryVariable, temp_var_scope
 
         for name, dtype in zip(acc_var_names, reduction_dtypes):
             new_temporary_variables[name] = TemporaryVariable(
                     name=name,
                     shape=(),
                     dtype=dtype,
-                    is_local=False)
+                    scope=temp_var_scope.PRIVATE)
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
         bad_inames = frozenset(expr.inames) & outer_insn_inames
@@ -808,7 +836,7 @@ def preprocess_kernel(kernel, device=None):
     from loopy.transform.ilp import add_axes_to_temporaries_for_ilp_and_vec
     kernel = add_axes_to_temporaries_for_ilp_and_vec(kernel)
 
-    kernel = mark_local_temporaries(kernel)
+    kernel = find_temporary_scope(kernel)
     kernel = find_boostability(kernel)
     kernel = limit_boostability(kernel)
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 1a044b8d7f389fbb8e58d3fa597932f8c56ac97f..8a31f67fbf85bf8a370f17d2775569697f247502 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -45,7 +45,7 @@ __doc__ = """
 
 .. autofunction:: sum_mem_access_to_bytes
 
-.. autofunction:: get_barrier_poly
+.. autofunction:: get_synchronization_poly
 
 .. autofunction:: gather_access_footprints
 .. autofunction:: gather_access_footprint_bytes
@@ -793,16 +793,21 @@ def sum_mem_access_to_bytes(m):
 # }}}
 
 
-# {{{ get_barrier_poly
+# {{{ get_synchronization_poly
 
-def get_barrier_poly(knl):
+def get_synchronization_poly(knl):
 
-    """Count the number of barriers each thread encounters in a loopy kernel.
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
-    :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls
-             made (in terms of the :class:`loopy.LoopKernel` *inames*).
+    :return: A dictionary mapping each type of synchronization event to a
+            :class:`islpy.PwQPolynomial` holding the number of such events
+            per thread.
+
+            Possible keys include ``barrier_local``, ``barrier_global``
+            (if supported by the target) and ``kernel_launch``.
 
     Example usage::
 
@@ -817,13 +822,27 @@ def get_barrier_poly(knl):
     """
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.schedule import EnterLoop, LeaveLoop, Barrier
+    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
+            CallKernel, ReturnFromKernel, RunInstruction)
     from operator import mul
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
     knl = lp.get_one_scheduled_kernel(knl)
     iname_list = []
-    barrier_poly = isl.PwQPolynomial('{ 0 }')
+
+    result = ToCountMap()
+
+    one = isl.PwQPolynomial('{ 1 }')
+
+    def get_count_poly(iname_list):
+        # count contributed by one schedule item nested inside *iname_list*
+        if not iname_list:
+            return one  # not inside any loop
+
+        nest_domain = knl.get_inames_domain(iname_list).project_out_except(
+                iname_list, [dim_type.set])
+        # reduce() over a one-element tuple simply returns that element
+        return reduce(mul, (count(knl, nest_domain),))
 
     for sched_item in knl.schedule:
         if isinstance(sched_item, EnterLoop):
@@ -832,17 +851,23 @@ def get_barrier_poly(knl):
         elif isinstance(sched_item, LeaveLoop):
             if sched_item.iname:  # (if not empty)
                 iname_list.pop()
+
         elif isinstance(sched_item, Barrier):
-            if iname_list:  # (if iname_list is not empty)
-                ct = (count(knl, (
-                                knl.get_inames_domain(iname_list).
-                                project_out_except(iname_list, [dim_type.set])
-                                )), )
-                barrier_poly += reduce(mul, ct)
-            else:
-                barrier_poly += isl.PwQPolynomial('{ 1 }')
+            result = result + ToCountMap(
+                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})
+
+        elif isinstance(sched_item, CallKernel):
+            result = result + ToCountMap(
+                    {"kernel_launch": get_count_poly(iname_list)})
+
+        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
+            pass
+
+        else:
+            raise LoopyError("unexpected schedule item: %s"
+                    % type(sched_item).__name__)
 
-    return barrier_poly
+    return result.dict
 
 # }}}
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 493cef0634ea455f6a48c66e4cfbcb0da9b46572..6aca830d99c5637fc92e96f361c3c8bef5d65229 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -210,15 +210,15 @@ class CASTBuilder(ASTBuilderBase):
             return Const(POD(self, idi.dtype, idi.name))
         else:
             name = idi.base_name or idi.name
-            arg = kernel.arg_dict[name]
+            var_descr = kernel.get_var_descriptor(name)
             from loopy.kernel.data import ArrayBase
-            if isinstance(arg, ArrayBase):
-                return arg.get_arg_decl(
+            if isinstance(var_descr, ArrayBase):
+                return var_descr.get_arg_decl(
                         self,
                         idi.name[len(name):], idi.shape, idi.dtype,
                         idi.is_written)
             else:
-                return arg.get_arg_decl(self)
+                return var_descr.get_arg_decl(self)
 
     def get_function_declaration(self, codegen_state, codegen_result,
             schedule_index):
@@ -234,6 +234,8 @@ class CASTBuilder(ASTBuilderBase):
                             for idi in codegen_state.implemented_data_info])
 
     def get_temporary_decls(self, codegen_state):
+        from loopy.kernel.data import temp_var_scope
+
         kernel = codegen_state.kernel
 
         base_storage_decls = []
@@ -254,9 +256,12 @@ class CASTBuilder(ASTBuilderBase):
 
             if not tv.base_storage:
                 for idi in decl_info:
-                    temp_decls.append(
-                            self.wrap_temporary_decl(
-                                self.get_temporary_decl(kernel, tv, idi), tv.scope))
+                    # global temp vars are mapped to arguments
+                    if tv.scope != temp_var_scope.GLOBAL:
+                        temp_decls.append(
+                                self.wrap_temporary_decl(
+                                    self.get_temporary_decl(
+                                        kernel, tv, idi), tv.scope))
 
             else:
                 offset = 0
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 779abc02e7d3aa86149128a686d45c4aada8e2b0..72147daf80fdfdb10e8e62d3b2163879a325e962 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -52,9 +52,11 @@ def adjust_local_temp_var_storage(kernel, device):
 
     new_temp_vars = {}
 
+    from loopy.kernel.data import temp_var_scope
+
     lmem_size = cl_char.usable_local_mem_size(device)
     for temp_var in six.itervalues(kernel.temporary_variables):
-        if not temp_var.is_local:
+        if temp_var.scope != temp_var_scope.LOCAL:
             new_temp_vars[temp_var.name] = \
                     temp_var.copy(storage_shape=temp_var.shape)
             continue
@@ -62,7 +64,8 @@ def adjust_local_temp_var_storage(kernel, device):
         other_loctemp_nbytes = [
                 tv.nbytes
                 for tv in six.itervalues(kernel.temporary_variables)
-                if tv.is_local and tv.name != temp_var.name]
+                if tv.scope == temp_var_scope.LOCAL
+                and tv.name != temp_var.name]
 
         storage_shape = temp_var.storage_shape
 
@@ -450,7 +453,8 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info):
                         'must be supplied")'.format(name=idi.name))))
 
         if idi.dtype.is_integral():
-            gen(Comment("cast to Python int to avoid trouble with struct packing or Boost.Python"))
+            gen(Comment("cast to Python int to avoid trouble "
+                "with struct packing or Boost.Python"))
             if sys.version_info < (3,):
                 py_type = "long"
             else:
@@ -567,20 +571,58 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
 
     def get_function_definition(self, codegen_state, codegen_result,
             schedule_index, function_decl, function_body):
+        from loopy.kernel.data import TemporaryVariable
         args = (
                 ["_lpy_cl_kernels", "queue"]
-                + [idi.name for idi in codegen_state.implemented_data_info]
-                + ["wait_for=None"])
+                + [idi.name for idi in codegen_state.implemented_data_info
+                    if not issubclass(idi.arg_class, TemporaryVariable)]
+                + ["wait_for=None", "allocator=None"])
+
+        ecm = self.get_expression_to_code_mapper(codegen_state)
+
+        def alloc_nbytes(idi):
+            # largest flat offset touched; strides presumably count elements
+            last_index = sum(stride*(length-1) for length, stride
+                    in zip(idi.unvec_shape, idi.unvec_strides))
+            return idi.dtype.numpy_dtype.itemsize * (last_index + 1)
 
-        from genpy import Function, Suite, ImportAs, Return, FromImport, Line
+        from genpy import (Function, Suite, Import, ImportAs, Return, FromImport,
+                If, Assign, Line, Statement as S)
+        from pymbolic.mapper.stringifier import PREC_NONE
         return Function(
                 codegen_result.current_program(codegen_state).name,
                 args,
                 Suite([
                     FromImport("struct", ["pack as _lpy_pack"]),
                     ImportAs("pyopencl", "_lpy_cl"),
+                    Import("pyopencl.tools"),
+                    Line(),
+                    If("allocator is None",
+                        Assign(
+                            "allocator",
+                            "_lpy_cl_tools.DeferredAllocator(queue.context)")),
+                    Line(),
+                    ] + [
+
+                    # allocate global temporaries
+                    Assign(idi.name, "allocator(%s)"
+                        % ecm(alloc_nbytes(idi), PREC_NONE, "i"))
+                    for idi in codegen_result.implemented_data_info
+                    if issubclass(idi.arg_class, TemporaryVariable)
+
+                    ] + [
                     Line(),
                     function_body,
+                    Line(),
+                    ] + [
+
+                    # free global temporaries
+                    S("%s.release()" % idi.name)
+                    for idi in codegen_result.implemented_data_info
+                    if issubclass(idi.arg_class, TemporaryVariable)
+
+                    ] + [
+                    Line(),
                     Return("_lpy_evt"),
                     ]))
 
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 002d5986a6f81a68a1c060937a2ebb7ab4821157..a7d22b2d0f7e376086b3a42ac2f186e2805fda26 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -32,6 +32,7 @@ from pymbolic.mapper.substitutor import make_subst_func
 from pytools.persistent_dict import PersistentDict
 from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper
 from loopy.version import DATA_MODEL_VERSION
+from loopy.diagnostic import LoopyError
 
 from pymbolic import var
 
@@ -130,7 +131,8 @@ buffer_array_cache = PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSI
 # Adding an argument? also add something to the cache_key below.
 def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         store_expression=None, within=None, default_tag="l.auto",
-        temporary_is_local=None, fetch_bounding_box=False):
+        temporary_scope=None, temporary_is_local=None,
+        fetch_bounding_box=False):
     """
     :arg init_expression: Either *None* (indicating the prior value of the buffered
         array should be read) or an expression optionally involving the
@@ -143,6 +145,27 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         at all.)
     """
 
+    # {{{ unify temporary_scope / temporary_is_local
+
+    from loopy.kernel.data import temp_var_scope
+    if temporary_is_local is not None:
+        from warnings import warn
+        warn("temporary_is_local is deprecated. Use temporary_scope instead",
+                DeprecationWarning, stacklevel=2)
+
+        if temporary_scope is not None:
+            raise LoopyError("may not specify both temporary_is_local and "
+                    "temporary_scope")
+
+        if temporary_is_local:
+            temporary_scope = temp_var_scope.LOCAL
+        else:
+            temporary_scope = temp_var_scope.PRIVATE
+
+    del temporary_is_local
+
+    # }}}
+
     # {{{ process arguments
 
     if isinstance(init_expression, str):
@@ -181,9 +204,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
     else:
         var_shape = ()
 
-    if temporary_is_local is None:
+    if temporary_scope is None:
         import loopy as lp
-        temporary_is_local = lp.auto
+        temporary_scope = lp.auto
 
     # }}}
 
@@ -196,7 +219,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
     cache_key = (key_kernel, var_name, tuple(buffer_inames),
             PymbolicExpressionHashWrapper(init_expression),
             PymbolicExpressionHashWrapper(store_expression), within,
-            default_tag, temporary_is_local, fetch_bounding_box)
+            default_tag, temporary_scope, fetch_bounding_box)
 
     if CACHING_ENABLED:
         try:
@@ -312,7 +335,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
             dtype=var_descr.dtype,
             base_indices=(0,)*len(abm.non1_storage_shape),
             shape=tuple(abm.non1_storage_shape),
-            is_local=temporary_is_local)
+            scope=temporary_scope)
 
     new_temporary_variables[buf_var_name] = temp_var
 
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 02499ded2c912f4f0bbbdcfbfc4a3d3d161414b3..3db96712eab21d9ad4efed6fc0a3dde08d1b9f65 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -136,7 +136,8 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames,
 
 def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
         default_tag="l.auto", rule_name=None,
-        temporary_name=None, temporary_is_local=None,
+        temporary_name=None,
+        temporary_scope=None, temporary_is_local=None,
         footprint_subscripts=None,
         fetch_bounding_box=False):
     """Prefetch all accesses to the variable *var_name*, with all accesses
@@ -245,7 +246,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
             default_tag=default_tag, dtype=arg.dtype,
             fetch_bounding_box=fetch_bounding_box,
             temporary_name=temporary_name,
-            temporary_is_local=temporary_is_local)
+            temporary_scope=temporary_scope, temporary_is_local=temporary_is_local)
 
     # {{{ remove inames that were temporarily added by slice sweeps
 
@@ -529,4 +530,45 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False):
 
 # }}}
 
+
+# {{{ set temporary scope
+
+def set_temporary_scope(kernel, temp_var_names, scope):
+    """Return a copy of *kernel* with *scope* set on the given temporaries.
+
+    :arg temp_var_names: an iterable of names of existing temporaries, or
+        a comma-separated string of such names.
+    :arg scope: One of the values from :class:`temp_var_scope`, or one
+        of the strings ``"private"``, ``"local"``, or ``"global"``.
+    """
+
+    if isinstance(temp_var_names, str):
+        temp_var_names = [s.strip() for s in temp_var_names.split(",")]
+
+    from loopy.kernel.data import temp_var_scope
+    if isinstance(scope, str):
+        try:
+            scope = getattr(temp_var_scope, scope.upper())
+        except AttributeError:
+            raise LoopyError("scope '%s' unknown" % scope)
+
+    if not isinstance(scope, int) or scope not in [
+            temp_var_scope.PRIVATE,
+            temp_var_scope.LOCAL,
+            temp_var_scope.GLOBAL]:
+        raise LoopyError("invalid scope '%s'" % scope)
+
+    new_temp_vars = kernel.temporary_variables.copy()
+    for tv_name in temp_var_names:
+        try:
+            tv = new_temp_vars[tv_name]
+        except KeyError:
+            raise LoopyError("temporary '%s' not found" % tv_name)
+
+        new_temp_vars[tv_name] = tv.copy(scope=scope)
+
+    return kernel.copy(temporary_variables=new_temp_vars)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 6ea0c06e631c5ea321dfd3a11b42f00bb0480078..fd6f33efc95d568fbc553c761c64508f42c4832c 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -240,7 +240,8 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
 def precompute(kernel, subst_use, sweep_inames=[], within=None,
         storage_axes=None, temporary_name=None, precompute_inames=None,
         storage_axis_to_tag={}, default_tag="l.auto", dtype=None,
-        fetch_bounding_box=False, temporary_is_local=None,
+        fetch_bounding_box=False,
+        temporary_scope=None, temporary_is_local=None,
         compute_insn_id=None):
     """Precompute the expression described in the substitution rule determined by
     *subst_use* and store it in a temporary array. A precomputation needs two
@@ -316,6 +317,27 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     eliminated.
     """
 
+    # {{{ unify temporary_scope / temporary_is_local
+
+    from loopy.kernel.data import temp_var_scope
+    if temporary_is_local is not None:
+        from warnings import warn
+        warn("temporary_is_local is deprecated. Use temporary_scope instead",
+                DeprecationWarning, stacklevel=2)
+
+        if temporary_scope is not None:
+            raise LoopyError("may not specify both temporary_is_local and "
+                    "temporary_scope")
+
+        if temporary_is_local:
+            temporary_scope = temp_var_scope.LOCAL
+        else:
+            temporary_scope = temp_var_scope.PRIVATE
+
+    del temporary_is_local
+
+    # }}}
+
     # {{{ check, standardize arguments
 
     if isinstance(sweep_inames, str):
@@ -772,8 +794,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     import loopy as lp
 
-    if temporary_is_local is None:
-        temporary_is_local = lp.auto
+    if temporary_scope is None:
+        temporary_scope = lp.auto
 
     new_temp_shape = tuple(abm.non1_storage_shape)
 
@@ -784,7 +806,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
                 dtype=dtype,
                 base_indices=(0,)*len(new_temp_shape),
                 shape=tuple(abm.non1_storage_shape),
-                is_local=temporary_is_local,
+                scope=temporary_scope,
                 dim_names=non1_storage_axis_names)
 
     else:
@@ -822,19 +844,20 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
         temp_var = temp_var.copy(shape=new_temp_shape)
 
-        if temporary_is_local == temp_var.is_local:
+        if temporary_scope == temp_var.scope:
             pass
-        elif temporary_is_local is lp.auto:
-            temporary_is_local = temp_var.is_local
-        elif temp_var.is_local is lp.auto:
+        elif temporary_scope is lp.auto:
+            temporary_scope = temp_var.scope
+        elif temp_var.scope is lp.auto:
             pass
         else:
             raise LoopyError("Existing and new temporary '%s' do not "
-                    "have matching values of 'is_local'"
+                    "have matching scopes (existing: %s, new: %s)"
                     % (temporary_name,
-                        temp_var.is_local, temporary_is_local))
+                        temp_var_scope.stringify(temp_var.scope),
+                        temp_var_scope.stringify(temporary_scope)))
 
-        temp_var = temp_var.copy(is_local=temporary_is_local)
+        temp_var = temp_var.copy(scope=temporary_scope)
 
         # }}}
 
diff --git a/loopy/version.py b/loopy/version.py
index ce1cf30894964e17a8831588d7aaac91b9cabb6a..cfaa4b9a01d4eeb57b4cdf1b64a114e3b8c3bb33 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v32-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v33-islpy%s" % _islpy_version
diff --git a/test/test_loopy.py b/test/test_loopy.py
index fb19343c1cc056c0c579e71e60fa8203938e2c4a..1d710bd9d306eacc2df4bfbfe4132cf08aa2dc5e 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2614,7 +2614,7 @@ def test_kernel_splitting_with_loop(ctx_factory):
     knl = lp.add_and_infer_dtypes(knl,
             {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
 
-    ref_knl = knl
+    # ref_knl = knl
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
@@ -2654,7 +2654,6 @@ def test_kernel_splitting_with_loop_and_temporaries(ctx_factory):
 
     knl = lp.add_and_infer_dtypes(knl,
             {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
-
     ref_knl = knl
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
@@ -2680,6 +2679,34 @@ def test_kernel_splitting_with_loop_and_temporaries(ctx_factory):
     #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
+def test_global_temporary(ctx_factory):
+    """Generate and run a kernel whose temporary ``c`` has global scope."""
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n}",
+            """
+            <> c[i] = a[i + 1]
+            out[i] = c[i]
+            """)
+
+    knl = lp.add_and_infer_dtypes(knl,
+            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+    knl = lp.set_temporary_scope(knl, "c", "global")
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+
+    cgr = lp.generate_code_v2(knl)
+
+    # NOTE(review): presumably one program writes "c", the other reads it
+    assert len(cgr.device_programs) == 2
+
+    # compare against the kernel as it was before split_iname
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 2cf537f5ed9c039d09cb1d10066ec9294898d9b9..4bcacf59eb5752800cee2322844d6e33968dc91e 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,7 +28,6 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 import loopy as lp
-from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly
 import numpy as np
 
 
@@ -46,7 +45,7 @@ def test_op_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -71,7 +70,7 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -93,7 +92,7 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -122,7 +121,7 @@ def test_op_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -158,7 +157,7 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -197,7 +196,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
+    poly = lp.get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
     value_dict = dict(m=13, n=200)
     flops = poly.eval_with_dict(value_dict)
 
@@ -221,7 +220,7 @@ def test_gmem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -255,7 +254,7 @@ def test_gmem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -283,7 +282,7 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -317,7 +316,7 @@ def test_gmem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -358,7 +357,7 @@ def test_gmem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -392,7 +391,7 @@ def test_gmem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", threads)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    poly = get_gmem_access_poly(knl)  # noqa
+    poly = lp.get_gmem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -436,7 +435,7 @@ def test_gmem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    poly = get_gmem_access_poly(knl)  # noqa
+    poly = lp.get_gmem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -475,7 +474,7 @@ def test_gmem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -514,13 +513,13 @@ def test_barrier_counter_nobarriers():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_barrier_poly(knl)
+    sync_poly = lp.get_synchronization_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    barrier_count = poly.eval_with_dict(params)
-    assert barrier_count == 0
+    assert len(sync_poly) == 1
+    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
 
 
 def test_barrier_counter_barriers():
@@ -540,12 +539,13 @@ def test_barrier_counter_barriers():
             )
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
     knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")
-    poly = get_barrier_poly(knl)
+    poly = lp.get_synchronization_poly(knl)
+    print(poly)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    barrier_count = poly.eval_with_dict(params)
+    barrier_count = poly["barrier_local"].eval_with_dict(params)
     assert barrier_count == 50*10*2
 
 
@@ -566,10 +566,11 @@ def test_all_counters_parallel_matmul():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    barrier_count = get_barrier_poly(knl).eval_with_dict(params)
-    assert barrier_count == 0
+    sync_poly = lp.get_synchronization_poly(knl)
+    assert len(sync_poly) == 1
+    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
 
-    op_map = get_op_poly(knl)
+    op_map = lp.get_op_poly(knl)
     f32mul = op_map[
                         (np.dtype(np.float32), 'mul')
                         ].eval_with_dict(params)
@@ -586,7 +587,7 @@ def test_all_counters_parallel_matmul():
     assert f32mul+f32add == n*m*l*2
     assert i32ops == n*m*l*4 + l*n*4
 
-    subscript_map = get_gmem_access_poly(knl)
+    subscript_map = lp.get_gmem_access_poly(knl)
     f32uncoal = subscript_map[
                         (np.dtype(np.float32), 'nonconsecutive', 'load')
                         ].eval_with_dict(params)