diff --git a/boxtree/dfmm.py b/boxtree/dfmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebd8350e4d36546368b04cea661669326b592cd2
--- /dev/null
+++ b/boxtree/dfmm.py
@@ -0,0 +1,117 @@
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner \
+                 Copyright (C) 2017 Hao Gao"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+logger = logging.getLogger(__name__)
+
+from mpi4py import MPI
+import numpy as np
+
+def drive_dfmm(traversal, expansion_wrangler, src_weights):
+    
+    #  {{{ Get MPI information
+
+    comm = MPI.COMM_WORLD
+    current_rank = comm.Get_rank()
+    total_rank = comm.Get_size()
+
+    # }}}
+
+    # {{{ Distribute tree parameters
+
+    if current_rank == 0:
+        tree = traversal.tree
+        # TODO: distribute more parameters of the tree
+        parameters = {"sources_are_targets": tree.sources_are_targets,
+                      "sources_have_extent": tree.sources_have_extent,
+                      "nsources":tree.nsources,
+                      "nboxes":tree.box_source_starts.shape[0], 
+                      "dimensions":tree.sources.shape[0], 
+                      "coord_dtype":tree.coord_dtype,
+                      "box_id_dtype":tree.box_id_dtype}
+    else:
+        parameters = None
+    parameters = comm.bcast(parameters, root=0)
+    
+    # }}}
+
+    # {{{ Fill tree parameters to the locally essentail tree
+
+    from boxtree import Tree
+    letree = Tree()
+    # TODO: add more parameters to the locally essential tree
+    letree.sources_are_targets = parameters["sources_are_targets"]
+
+    # }}}
+
+    # {{{ Construct locally essential tree mask for each rank
+
+    # Problem: Current implementation divides all boxes evenly across all 
+    # ranks. This scheme is subject to significant load imbalance. A better way to do 
+    # this is to assign a weight to each box according to its interaction list, and then 
+    # divides boxes evenly by the total weights.
+
+    if current_rank == 0:
+        # mask[i][j] is true iff box j is in the locally essential tree of rank i
+        mask = np.zeros((total_rank, parameters["nboxes"]), dtype=bool)
+        num_boxes_per_rank = (parameters["nboxes"] + total_rank - 1) // total_rank
+        
+        for i in range(total_rank):
+            # Get the start and end box index for rank i 
+            box_start_idx = num_boxes_per_rank * i
+            if current_rank == total_rank - 1:
+                box_end_idx = parameters["nboxes"]
+            else:
+                box_end_idx = num_boxes_per_rank * (i + 1)
+
+            # Mark all ancestors of boxes of rank i
+            new_mask = np.zeros(parameters["nboxes"], dtype=bool)
+            new_mask[box_start_idx:box_end_idx] = True
+            while np.count_nonzero(new_mask) != 0:
+                np.logical_or(mask[i, :], new_mask, out=mask[i, :])
+                new_mask_idx = np.nonzero(new_mask)
+                new_mask_parent_idx = tree.box_parent_ids[new_mask_idx]
+                new_mask[:] = False
+                new_mask[new_mask_parent_idx] = True
+                new_mask = np.logical_and(new_mask, np.logical_not(mask[i, :]), 
+                                          out=new_mask)
+            
+            # Generate interaction list mask for mask[i, :]
+            interaction_mask = np.zeros(parameters["nboxes"], dtype=bool)
+            box_indices = np.nonzero(mask[i, :])
+            for j in range(len(box_indices)):
+                box_index = box_indices[j]
+                # List 1
+                start, end = traversal.neighbor_source_boxes_starts[box_index:box_index + 2]
+                list1_idx = traversal.neighbor_source_boxes_lists[start:end]
+                interaction_mask[list1_idx] = True
+                # List 2
+                start, end = traversal.from_sep_siblings_starts[box_index:box_index + 2]
+                list2_idx = traversal.from_sep_siblings_lists[start:end]
+                interaction_mask[list2_idx] = True
+                # List 3
+                
+
+    # }}}
diff --git a/test/test_dfmm.py b/test/test_dfmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a6c72422cab87384a80b74af2dc38e42371a560
--- /dev/null
+++ b/test/test_dfmm.py
@@ -0,0 +1,80 @@
+import numpy as np
+import sys
+from mpi4py import MPI
+
+# Parameters
+dims = 2
+nsources = 30
+ntargets = 10
+dtype = np.float64
+
+# Get the current rank
+comm = MPI.COMM_WORLD
+rank = comm.Get_rank()
+
+# Initialization
+trav = None
+sources_weights = None
+wrangler = None
+
+# Generate particles and run shared-memory parallelism on rank 0
+if rank == 0:
+    # Configure PyOpenCL
+    import pyopencl as cl
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    # Generate random particles and source weights
+    from boxtree.tools import make_normal_particle_array as p_normal
+    sources = p_normal(queue, nsources, dims, dtype, seed=15)
+    targets = p_normal(queue, ntargets, dims, dtype, seed=18) + np.array([2, 0, 0])[:dims]
+
+    from boxtree.tools import particle_array_to_host
+    sources_host = particle_array_to_host(sources)
+    targets_host = particle_array_to_host(targets)
+
+    from pyopencl.clrandom import PhiloxGenerator
+    rng = PhiloxGenerator(queue.context, seed=20)
+    sources_weights = rng.uniform(queue, nsources, dtype=np.float64).get()
+
+    # Display sources and targets
+    if "--display" in sys.argv:
+        import matplotlib.pyplot as plt
+        plt.plot(sources_host[:, 0], sources_host[:, 1], "bo")
+        plt.plot(targets_host[:, 0], targets_host[:, 1], "ro")
+        plt.show()
+
+    # Calculate potentials using direct evaluation
+    import numpy.linalg as la
+    distances = la.norm(sources_host.reshape(1, nsources, 2) - \
+                        targets_host.reshape(ntargets, 1, 2), 
+                        ord=2, axis=2)
+    pot_naive = np.sum(-np.log(distances)*sources_weights, axis=1)
+
+    # Build the tree and interaction lists
+    from boxtree import TreeBuilder
+    tb = TreeBuilder(ctx)
+    tree, _ = tb(queue, sources, targets=targets, max_particles_in_box=30, debug=True)
+
+    from boxtree.traversal import FMMTraversalBuilder
+    tg = FMMTraversalBuilder(ctx)
+    trav, _ = tg(queue, tree, debug=True)
+    trav = trav.get(queue=queue)
+
+    # Get pyfmmlib expansion wrangler
+    from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
+    def fmm_level_to_nterms(tree, level):
+        return 20
+    wrangler = FMMLibExpansionWrangler(trav.tree, 0, fmm_level_to_nterms=fmm_level_to_nterms)
+
+    # Compute FMM using shared memory parallelism
+    from boxtree.fmm import drive_fmm
+    pot_fmm = drive_fmm(trav, wrangler, sources_weights)* 2 * np.pi
+    print(la.norm(pot_fmm - pot_naive, ord=2))
+
+# Compute FMM using distributed memory parallelism
+from boxtree.dfmm import drive_dfmm
+# Note: The drive_dfmm interface works as follows: 
+# Rank 0 passes the correct trav, wrangler, and sources_weights
+# All other ranks pass None to these arguments
+pot_dfmm = drive_dfmm(trav, wrangler, sources_weights)