diff --git a/include/ALL.hpp b/include/ALL.hpp
index e2aa64615c747f03e42b61c4eedce6c122d88f8a..6b9455c84c262a087788b7e4e6865957792f8665 100644
--- a/include/ALL.hpp
+++ b/include/ALL.hpp
@@ -41,6 +41,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ALL_ForceBased.hpp"
 #include "ALL_Histogram.hpp"
 #include "ALL_Staggered.hpp"
+#include "ALL_Iterative.hpp"
 #include <algorithm>
 #include <iomanip>
 #include <memory>
@@ -72,871 +73,919 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-namespace ALL {
+namespace ALL
+{
 
 #define ALL_ESTIMATION_LIMIT_DEFAULT 0
 
-/// enum type to describe the different balancing methods
-enum LB_t : int {
-  /// staggered grid load balancing
-  STAGGERED = 0,
-  /// tensor based load balancing
-  TENSOR = 1,
-  /// unstructured-mesh load balancing
-  FORCEBASED = 2,
+  /// enum type to describe the different balancing methods
+  enum LB_t : int
+  {
+    /// staggered grid load balancing
+    STAGGERED = 0,
+    /// tensor based load balancing
+    TENSOR = 1,
+    /// unstructured-mesh load balancing
+    FORCEBASED = 2,
 #ifdef ALL_VORONOI_ACTIVE
-  /// voronoi cell based load balancing
-  VORONOI = 3,
+    /// voronoi cell based load balancing
+    VORONOI = 3,
 #endif
-  /// histogram based load balancing
-  HISTOGRAM = 4,
-  /// tensor based load balancing using maximum values
-  TENSOR_MAX = 5,
-  /// Unimplemented load balancing NEVER SUPPORTED
-  UNIMPLEMENTED = 128
-};
-
-/// @tparam T data type for vertices and related data
-/// @tparam W data type for work and related data
-
-template <class T, class W> class ALL {
-public:
-  /// default constructor, that sets up internal data structures and initializes
-  /// internal values shared between all balancing methods
-  ALL()
-      : loadbalancing_step(0)
+    /// histogram based load balancing
+    HISTOGRAM = 4,
+    /// tensor based load balancing using maximum values
+    TENSOR_MAX = 5,
+    /// Unimplemented load balancing NEVER SUPPORTED
+    UNIMPLEMENTED = 128
+  };
+
+  /// @tparam T data type for vertices and related data
+  /// @tparam W data type for work and related data
+  template <class T, class W>
+  class ALL
+  {
+  public:
+    /// default constructor that sets up internal data structures and
+    /// initializes internal values shared between all balancing methods
+    ALL()
+        : loadbalancing_step(0)
 #ifdef ALL_VTK_OUTPUT
-        ,
-        vtk_init(false)
+          ,
+          vtk_init(false)
 #endif
-  {
-    // determine correct MPI data type for template T
-    if (std::is_same<T, double>::value)
-      MPIDataTypeT = MPI_DOUBLE;
-    else if (std::is_same<T, float>::value)
-      MPIDataTypeT = MPI_FLOAT;
-    else if (std::is_same<T, int>::value)
-      MPIDataTypeT = MPI_INT;
-    else if (std::is_same<T, long>::value)
-      MPIDataTypeT = MPI_LONG;
-
-    // determine correct MPI data type for template W
-    if (std::is_same<W, double>::value)
-      MPIDataTypeW = MPI_DOUBLE;
-    else if (std::is_same<W, float>::value)
-      MPIDataTypeW = MPI_FLOAT;
-    else if (std::is_same<W, int>::value)
-      MPIDataTypeW = MPI_INT;
-    else if (std::is_same<W, long>::value)
-      MPIDataTypeW = MPI_LONG;
-  }
-
-  /// constructor providing the balancing method to use, the dimensions of the
-  /// vertices and the gamma correction value and setting the chosen balancing
-  /// methods up
-  /// @param[in] m the balancing method to be used
-  /// @param[in] d the dimension of the vertices to be used (most methods
-  /// currently only support 3D vertices)
-  /// @param[in] g the value used for the correction value, if required by the
-  /// chosen balancing method
-  ALL(const LB_t m, const int d, const T g) : ALL() {
-    method = m;
-    switch (method) {
-    case LB_t::TENSOR:
-      balancer.reset(new Tensor_LB<T, W>(d, (W)0, g));
-      break;
-    case LB_t::STAGGERED:
-      balancer.reset(new Staggered_LB<T, W>(d, (W)0, g));
-      break;
-    case LB_t::FORCEBASED:
-      balancer.reset(new ForceBased_LB<T, W>(d, (W)0, g));
-      break;
+    {
+      // determine correct MPI data type for template T
+      if (std::is_same<T, double>::value)
+        MPIDataTypeT = MPI_DOUBLE;
+      else if (std::is_same<T, float>::value)
+        MPIDataTypeT = MPI_FLOAT;
+      else if (std::is_same<T, int>::value)
+        MPIDataTypeT = MPI_INT;
+      else if (std::is_same<T, long>::value)
+        MPIDataTypeT = MPI_LONG;
+
+      // determine correct MPI data type for template W
+      if (std::is_same<W, double>::value)
+        MPIDataTypeW = MPI_DOUBLE;
+      else if (std::is_same<W, float>::value)
+        MPIDataTypeW = MPI_FLOAT;
+      else if (std::is_same<W, int>::value)
+        MPIDataTypeW = MPI_INT;
+      else if (std::is_same<W, long>::value)
+        MPIDataTypeW = MPI_LONG;
+    }
+
+    /// constructor providing the balancing method to use, the dimension of the
+    /// vertices and the gamma correction value, and setting up the chosen
+    /// balancing method
+    /// @param[in] m the balancing method to be used
+    /// @param[in] d the dimension of the vertices to be used (most methods
+    /// currently only support 3D vertices)
+    /// @param[in] g the gamma correction value, if required by the chosen
+    /// balancing method
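+    ///
+    /// A minimal usage sketch (illustrative only; the gamma value 4.0 is a
+    /// placeholder):
+    /// \code{.cpp}
+    /// ALL::ALL<double, double> lb(ALL::LB_t::TENSOR, 3, 4.0);
+    /// \endcode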
+    ALL(const LB_t m, const int d, const T g) : ALL()
+    {
+      method = m;
+      switch (method)
+      {
+      case LB_t::TENSOR:
+        balancer.reset(new Tensor_LB<T, W>(d, (W)0, g));
+        break;
+      case LB_t::STAGGERED:
+        balancer.reset(new Staggered_LB<T, W>(d, (W)0, g));
+        break;
+      case LB_t::FORCEBASED:
+        balancer.reset(new ForceBased_LB<T, W>(d, (W)0, g));
+        break;
 #ifdef ALL_VORONOI_ACTIVE
-    case LB_t::VORONOI:
-      balancer.reset(new Voronoi_LB<T, W>(d, (W)0, g));
-      break;
+      case LB_t::VORONOI:
+        balancer.reset(new Voronoi_LB<T, W>(d, (W)0, g));
+        break;
 #endif
-    case LB_t::HISTOGRAM:
-      balancer.reset(new Histogram_LB<T, W>(d, std::vector<W>(10), g));
-      break;
-    case LB_t::TENSOR_MAX:
-      balancer.reset(new TensorMax_LB<T, W>(d, (W)0, g));
-      break;
-    default:
-      throw InvalidArgumentException(__FILE__, __func__, __LINE__,
-                                     "Unknown type of loadbalancing passed.");
+      case LB_t::HISTOGRAM:
+        balancer.reset(new Histogram_LB<T, W>(d, std::vector<W>(10), g));
+        break;
+      case LB_t::TENSOR_MAX:
+        balancer.reset(new TensorMax_LB<T, W>(d, (W)0, g));
+        break;
+      default:
+        throw InvalidArgumentException(__FILE__, __func__, __LINE__,
+                                       "Unknown type of loadbalancing passed.");
+      }
+      balancer->setDimension(d);
+      balancer->setGamma(g);
+      estimation_limit = ALL_ESTIMATION_LIMIT_DEFAULT;
     }
-    balancer->setDimension(d);
-    balancer->setGamma(g);
-    estimation_limit = ALL_ESTIMATION_LIMIT_DEFAULT;
-  }
-
-  /// constructor to setup the method, the dimension of the used vertices, the
-  /// correction value and also providing a first set of vertices to start from
-  /// @param[in] m the balancing method to be used
-  /// @param[in] d the dimension of the vertices to be used (most methods
-  /// currently only support 3D vertices)
-  /// @param[in] inp the set of vertices to be used in the
-  /// balancing step
-  /// @param[in] g the value used for the correction value, if required by the
-  /// chosen balancing method
-  ALL(const LB_t m, const int d, const std::vector<Point<T>> &inp, const T g)
-      : ALL(m, d, g) {
-    balancer->setVertices(inp);
-    calculate_outline();
-  }
-
-  /// destructor
-  ~ALL() = default;
-
-  /// method to provide a new set of vertices
-  /// @param[in] inp reference to a vector of ALL::Point objects describing
-  /// the vertices to be used in the balancing step
-  void setVertices(const std::vector<Point<T>> &inp) {
-    int dim = inp.at(0).getDimension();
-    if (dim != balancer->getDimension())
-      throw PointDimensionMissmatchException(
-          __FILE__, __func__, __LINE__,
-          "Dimension of ALL::Points in input vector do not match dimension of "
-          "ALL "
-          "object.");
-    balancer->setVertices(inp);
-    calculate_outline();
-  }
-
-  /// method to set the communicator to be used in the balancing step, if a non
-  /// cartesian communicator is provided, a cartesian communicator is created as
-  /// a copy and used internally
-  /// @param[in] comm the communicator to be used
-  void setCommunicator(const MPI_Comm comm) {
-    int comm_type;
-    MPI_Topo_test(comm, &comm_type);
-    if (comm_type == MPI_CART) {
-      communicator = comm;
-      balancer->setCommunicator(communicator);
-      // set procGridLoc and procGridDim
-      const int dimension = balancer->getDimension();
-      int location[dimension];
-      int size[dimension];
-      int periodicity[dimension];
-      MPI_Cart_get(communicator, dimension, size, periodicity, location);
-      procGridLoc.assign(location, location + dimension);
-      procGridDim.assign(size, size + dimension);
-      procGridSet = true;
-    } else {
-      int rank;
-      MPI_Comm_rank(comm, &rank);
-      if (procGridSet) {
-        // compute 1D index from the location in the process grid (using z-y-x
-        // ordering, as MPI_Dims_create)
-        int idx;
-        switch (balancer->getDimension()) {
-        case 3:
-          idx = procGridLoc.at(2) + procGridLoc.at(1) * procGridDim.at(2) +
-                procGridLoc.at(0) * procGridDim.at(2) * procGridDim.at(1);
-          break;
-        case 2:
-          idx = procGridLoc.at(1) + procGridLoc.at(0) * procGridDim.at(1);
-          break;
-        case 1:
-          idx = procGridLoc.at(0);
-          break;
-        default:
-          throw InternalErrorException(
-              __FILE__, __func__, __LINE__,
-              "ALL cannot deal with more then three dimensions (or less then "
-              "one).");
-          break;
-        }
 
-        // create new temporary communicator with correct ranks
-        MPI_Comm temp_comm;
-        MPI_Comm_split(comm, 0, idx, &temp_comm);
+    /// constructor to set up the method, the dimension of the used vertices
+    /// and the correction value, also providing a first set of vertices to
+    /// start from
+    /// @param[in] m the balancing method to be used
+    /// @param[in] d the dimension of the vertices to be used (most methods
+    /// currently only support 3D vertices)
+    /// @param[in] inp the set of vertices to be used in the
+    /// balancing step
+    /// @param[in] g the gamma correction value, if required by the chosen
+    /// balancing method
+    ALL(const LB_t m, const int d, const std::vector<Point<T>> &inp, const T g)
+        : ALL(m, d, g)
+    {
+      balancer->setVertices(inp);
+      calculate_outline();
+    }
 
-        std::vector<int> periods(3, 1);
+    /// destructor
+    ~ALL() = default;
+
+    /// method to provide a new set of vertices
+    /// @param[in] inp reference to a vector of ALL::Point objects describing
+    /// the vertices to be used in the balancing step
+    void setVertices(const std::vector<Point<T>> &inp)
+    {
+      int dim = inp.at(0).getDimension();
+      if (dim != balancer->getDimension())
+        throw PointDimensionMissmatchException(
+            __FILE__, __func__, __LINE__,
+            "Dimension of ALL::Points in input vector do not match dimension of "
+            "ALL "
+            "object.");
+      balancer->setVertices(inp);
+      calculate_outline();
+    }
 
-        // transform temporary communicator to cartesian communicator
-        MPI_Cart_create(temp_comm, balancer->getDimension(), procGridDim.data(),
-                        periods.data(), 0, &communicator);
+    /// method to set the communicator to be used in the balancing step; if a
+    /// non-Cartesian communicator is provided, a Cartesian communicator is
+    /// created as a copy and used internally
+    /// @param[in] comm the communicator to be used
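+    ///
+    /// A minimal sketch (illustrative only; builds a Cartesian communicator
+    /// from MPI_COMM_WORLD):
+    /// \code{.cpp}
+    /// int nRanks, dims[3] = {0, 0, 0}, periods[3] = {1, 1, 1};
+    /// MPI_Comm_size(MPI_COMM_WORLD, &nRanks);
+    /// MPI_Dims_create(nRanks, 3, dims);
+    /// MPI_Comm cart;
+    /// MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods, 0, &cart);
+    /// lb.setCommunicator(cart);
+    /// \endcode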
+    void setCommunicator(const MPI_Comm comm)
+    {
+      int comm_type;
+      MPI_Topo_test(comm, &comm_type);
+      if (comm_type == MPI_CART)
+      {
+        communicator = comm;
         balancer->setCommunicator(communicator);
-      } else {
-        throw InternalErrorException(
-            __FILE__, __func__, __LINE__,
-            "When using a non-cartesian communicator process grid parameters "
-            "required (not set).");
+        // set procGridLoc and procGridDim
+        const int dimension = balancer->getDimension();
+        int location[dimension];
+        int size[dimension];
+        int periodicity[dimension];
+        MPI_Cart_get(communicator, dimension, size, periodicity, location);
+        procGridLoc.assign(location, location + dimension);
+        procGridDim.assign(size, size + dimension);
+        procGridSet = true;
       }
-    }
-  }
-
-  /// method the get the correction value used in the balancing method
-  ///@result T the correction value for the balancing method
-  T getGamma() { return balancer->getGamma(); };
-
-  /// method to set the correction value to be used in the balancing method
-  /// @param[in] g the correctin value to be used
-  void setGamma(const T g) { balancer->setGamma(g); };
-
-  /// method to set a scalar work for the local domain
-  /// @param[in] work the scalar work for the local domain
-  void setWork(const W work) {
-    // set new value for work (single value for whole domain)
-    balancer->setWork(work);
-  }
-
-  // method to set a multi-dimensional work for the local domain
-  ///@param[in] work std::vector<W> containing the work for the local domain
-  void setWork(const std::vector<W> &work) { balancer->setWork(work); }
-
-  /// method to call the setup of the chosen balancing method (not all methods
-  /// require a setup)
-  void setup() { balancer->setup(); }
-
-  /// method the trigger the balancing step, that updates the vertices according
-  /// to the previously provided work and chosen method
-  /// @param[in] internal toggles if internal steps are performed, needed
-  /// for some recursive calls of the routine by some methods
-  /// @result std::vector<ALL::Point<T>> containing the shifted set of vertices
-  std::vector<Point<T>> &balance() {
-    nVertices = balancer->getNVertices();
-    switch (method) {
-    case LB_t::TENSOR:
-      balancer->balance(loadbalancing_step);
-      calculate_outline();
-      break;
-    case LB_t::STAGGERED:
+      else
+      {
+        int rank;
+        MPI_Comm_rank(comm, &rank);
+        if (procGridSet)
+        {
+          // compute 1D index from the location in the process grid (using z-y-x
+          // ordering, as MPI_Dims_create)
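+          // e.g. for a 2x3x4 grid, location (x,y,z) = (1,2,3) yields
+          // idx = 3 + 2*4 + 1*4*3 = 23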
+          int idx;
+          switch (balancer->getDimension())
+          {
+          case 3:
+            idx = procGridLoc.at(2) + procGridLoc.at(1) * procGridDim.at(2) +
+                  procGridLoc.at(0) * procGridDim.at(2) * procGridDim.at(1);
+            break;
+          case 2:
+            idx = procGridLoc.at(1) + procGridLoc.at(0) * procGridDim.at(1);
+            break;
+          case 1:
+            idx = procGridLoc.at(0);
+            break;
+          default:
+            throw InternalErrorException(
+                __FILE__, __func__, __LINE__,
+                "ALL cannot deal with more then three dimensions (or less then "
+                "one).");
+            break;
+          }
 
-      /* ToDo: Check if repetition is useful at all and
-       *       change it to be included for all methods,
-       *       not only STAGGERED */
-      /*
-      // estimation to improve speed of convergence
-      // changing vertices during estimation
-      std::vector<Point<T>> tmp_vertices = balancer->getVertices();
-      // saved original vertices
-      std::vector<Point<T>> old_vertices = balancer->getVertices();
-      // old temporary vertices
-      std::vector<Point<T>> old_tmp_vertices = balancer->getVertices();
-      // result temporary vertices
-      std::vector<Point<T>> result_vertices = balancer->getVertices();
-      // temporary work
-      W tmp_work = balancer->getWork().at(0);
-      // old temporary work
-      W old_tmp_work = balancer->getWork().at(0);
-
-      W max_work;
-      W sum_work;
-      double d_max;
-      int n_ranks;
+          // create new temporary communicator with correct ranks
+          MPI_Comm temp_comm;
+          MPI_Comm_split(comm, 0, idx, &temp_comm);
 
-      // compute work density on local domain
-      double V = 1.0;
-      for (int i = 0; i < balancer->getDimension(); ++i)
-          V *= ( outline.at(1)[i] - outline.at(0)[i] );
-      double rho = workArray.at(0) / V;
+          std::vector<int> periods(3, 1);
 
-      // collect maximum work in system
-      MPI_Allreduce(workArray.data(), &max_work, 1, MPIDataTypeW, MPI_MAX,
-      communicator); MPI_Allreduce(workArray.data(), &sum_work, 1,
-      MPIDataTypeW, MPI_SUM, communicator);
-      MPI_Comm_size(communicator,&n_ranks);
-      d_max = (double)(max_work) * (double)n_ranks / (double)sum_work - 1.0;
+          // transform temporary communicator to cartesian communicator
+          MPI_Cart_create(temp_comm, balancer->getDimension(), procGridDim.data(),
+                          periods.data(), 0, &communicator);
+          balancer->setCommunicator(communicator);
+        }
+        else
+        {
+          throw InternalErrorException(
+              __FILE__, __func__, __LINE__,
+              "When using a non-cartesian communicator process grid parameters "
+              "required (not set).");
+        }
+      }
+    }
 
+    /// method to get the correction value used in the balancing method
+    /// @result T the correction value for the balancing method
+    T getGamma() { return balancer->getGamma(); };
 
-      // internal needs to be readded to argument list
-      const bool internal = false
-      for (int i = 0; i < estimation_limit && d_max > 0.1 && !internal; ++i)
+    /// method to set the correction value to be used in the balancing method
+    /// @param[in] g the correction value to be used
+    void setGamma(const T g) { balancer->setGamma(g); };
+
+    /// method to set a scalar work for the local domain
+    /// @param[in] work the scalar work for the local domain
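+    ///
+    /// Minimal sketch (illustrative; the cost model is a placeholder):
+    /// \code{.cpp}
+    /// lb.setWork((double)nLocalParticles); // e.g. particle count as work
+    /// \endcode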
+    void setWork(const W work)
+    {
+      // set new value for work (single value for whole domain)
+      balancer->setWork(work);
+    }
+
+    /// method to set a multi-dimensional work for the local domain
+    /// @param[in] work std::vector<W> containing the work for the local domain
+    void setWork(const std::vector<W> &work) { balancer->setWork(work); }
+
+    /// method to call the setup of the chosen balancing method (not all methods
+    /// require a setup)
+    void setup() { balancer->setup(); }
+
+    /// method to trigger the balancing step, which updates the vertices
+    /// according to the previously provided work and the chosen method
+    /// @result std::vector<ALL::Point<T>> containing the shifted set of vertices
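+    ///
+    /// Typical per-step usage (illustrative sketch; localWork is a
+    /// placeholder):
+    /// \code{.cpp}
+    /// lb.setWork(localWork);
+    /// lb.setup();
+    /// std::vector<ALL::Point<double>> newVertices = lb.balance();
+    /// \endcode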
+    std::vector<Point<T>> &balance()
+    {
+      nVertices = balancer->getNVertices();
+      switch (method)
       {
-          balancer->setWork(tmp_work);
-          balancer->setup();
-          balancer->balance(true);
-          bool sane = true;
-          // check if the estimated boundaries are not too deformed
-          for (int j = 0; j < balancer->getDimension(); ++j)
-              sane = sane && (result_vertices.at(0)[j] <=
-      old_vertices.at(1)[j]);
-          MPI_Allreduce(MPI_IN_PLACE,&sane,1,MPI_CXX_BOOL,MPI_LAND,communicator);
-          if (sane)
-          {
-              old_tmp_vertices = tmp_vertices;
-              tmp_vertices = result_vertices;
-              V = 1.0;
-              for (int i = 0; i < balancer->getDimension(); ++i)
-                  V *= ( tmp_vertices.at(1)[i] - tmp_vertices.at(0)[i] );
-              old_tmp_work = tmp_work;
-              tmp_work = rho * V;
-          }
-          else if (!sane || i == estimation_limit - 1)
-          {
-              balancer->getVertices() = old_tmp_vertices;
-              workArray->at(0) = old_tmp_work;
-              calculate_outline();
-              i = estimation_limit;
-          }
-      }
+      case LB_t::TENSOR:
+        balancer->balance(loadbalancing_step);
+        calculate_outline();
+        break;
+      case LB_t::STAGGERED:
+
+        /* ToDo: Check if repetition is useful at all and
+         *       change it to be included for all methods,
+         *       not only STAGGERED */
+        /*
+        // estimation to improve speed of convergence
+        // changing vertices during estimation
+        std::vector<Point<T>> tmp_vertices = balancer->getVertices();
+        // saved original vertices
+        std::vector<Point<T>> old_vertices = balancer->getVertices();
+        // old temporary vertices
+        std::vector<Point<T>> old_tmp_vertices = balancer->getVertices();
+        // result temporary vertices
+        std::vector<Point<T>> result_vertices = balancer->getVertices();
+        // temporary work
+        W tmp_work = balancer->getWork().at(0);
+        // old temporary work
+        W old_tmp_work = balancer->getWork().at(0);
+
+        W max_work;
+        W sum_work;
+        double d_max;
+        int n_ranks;
+
+        // compute work density on local domain
+        double V = 1.0;
+        for (int i = 0; i < balancer->getDimension(); ++i)
+            V *= ( outline.at(1)[i] - outline.at(0)[i] );
+        double rho = workArray.at(0) / V;
+
+        // collect maximum work in system
+        MPI_Allreduce(workArray.data(), &max_work, 1, MPIDataTypeW, MPI_MAX,
+        communicator); MPI_Allreduce(workArray.data(), &sum_work, 1,
+        MPIDataTypeW, MPI_SUM, communicator);
+        MPI_Comm_size(communicator,&n_ranks);
+        d_max = (double)(max_work) * (double)n_ranks / (double)sum_work - 1.0;
+
+
+        // internal needs to be re-added to the argument list
+        const bool internal = false;
+        for (int i = 0; i < estimation_limit && d_max > 0.1 && !internal; ++i)
+        {
+            balancer->setWork(tmp_work);
+            balancer->setup();
+            balancer->balance(true);
+            bool sane = true;
+            // check if the estimated boundaries are not too deformed
+            for (int j = 0; j < balancer->getDimension(); ++j)
+            sane = sane && (result_vertices.at(0)[j] <=
+                            old_vertices.at(1)[j]);
+            MPI_Allreduce(MPI_IN_PLACE, &sane, 1, MPI_CXX_BOOL, MPI_LAND,
+                          communicator);
+            if (sane)
+            {
+                old_tmp_vertices = tmp_vertices;
+                tmp_vertices = result_vertices;
+                V = 1.0;
+                for (int i = 0; i < balancer->getDimension(); ++i)
+                    V *= ( tmp_vertices.at(1)[i] - tmp_vertices.at(0)[i] );
+                old_tmp_work = tmp_work;
+                tmp_work = rho * V;
+            }
+            else if (!sane || i == estimation_limit - 1)
+            {
+                balancer->getVertices() = old_tmp_vertices;
+                workArray.at(0) = old_tmp_work;
+                calculate_outline();
+                i = estimation_limit;
+            }
+        }
 
-      ((Staggered_LB<T,W>*)balancer.get())->setVertices(outline->data());
-      */
-      balancer->balance(loadbalancing_step);
-      calculate_outline();
-      break;
-    case LB_t::FORCEBASED:
-      balancer->balance(loadbalancing_step);
-      calculate_outline();
-      break;
+        ((Staggered_LB<T,W>*)balancer.get())->setVertices(outline.data());
+        */
+        balancer->balance(loadbalancing_step);
+        calculate_outline();
+        break;
+      case LB_t::FORCEBASED:
+        balancer->balance(loadbalancing_step);
+        calculate_outline();
+        break;
 #ifdef ALL_VORONOI_ACTIVE
-    case LB_t::VORONOI:
-      balancer->balance(loadbalancing_step);
-      break;
+      case LB_t::VORONOI:
+        balancer->balance(loadbalancing_step);
+        break;
 #endif
-    case LB_t::HISTOGRAM:
-      balancer->balance(loadbalancing_step);
-      calculate_outline();
-      break;
-    case LB_t::TENSOR_MAX:
-      balancer->balance(loadbalancing_step);
-      calculate_outline();
-      break;
-
-    default:
-      throw InvalidArgumentException(__FILE__, __func__, __LINE__,
-                                     "Unknown type of loadbalancing passed.");
+      case LB_t::HISTOGRAM:
+        balancer->balance(loadbalancing_step);
+        calculate_outline();
+        break;
+      case LB_t::TENSOR_MAX:
+        balancer->balance(loadbalancing_step);
+        calculate_outline();
+        break;
+
+      default:
+        throw InvalidArgumentException(__FILE__, __func__, __LINE__,
+                                       "Unknown type of loadbalancing passed.");
+      }
+      loadbalancing_step++;
+      return balancer->getVertices();
     }
-    loadbalancing_step++;
-    return balancer->getVertices();
-  }
-
-  /**********************/
-  /*  getter functions  */
-  /**********************/
-
-  /// get the vertices before performing the load-balancing step
-  /// @result std::vector<ALL::Point<T>> containing the vertices before the
-  /// balancing step
-  std::vector<Point<T>> &getPrevVertices() {
-    return balancer->getPrevVertices();
-  };
 
-  /// get the resulting vertices after the load-balancing step, if it has been
-  /// performed
-  /// @result std::vector<ALL::Point<T>> containing the resulting vertices after
-  /// the balancing step
-  std::vector<Point<T>> &getVertices() { return balancer->getVertices(); };
-
-  /// get the dimension of the ALL::Point<T> objects used to describe the
-  /// vertices of the domains
-  /// @result int containing the dimension of the vertices
-  int getDimension() { return balancer->getDimension(); }
-
-  /// method to get the work provided to the method
-  /// @param[out] result reference to std::vector<W> to store the vector of work
-  /// if an array of work was provided, e.g. for the histogram method
-  void getWork(std::vector<W> &result) { result = balancer->getWork(); }
-
-  /// method to get the work provided to the method
-  /// @result scalar work or first value of vector work
-  W getWork() { return balancer->getWork().at(0); }
-
-  /// method to provide a list of the ranks of the neighbors the local domain
-  /// has in all directions
-  /// @result vector if neighboring ranks
-  std::vector<int> &getNeighbors() { return balancer->getNeighbors(); }
-
-  /// method to provide a list of neighboring vertices, e.g. required for
-  /// VORONOI
-  /// @result vector of neighboring vertices
-  /// neighboring vertices are stored in
-  std::vector<T> &getNeighborVertices() {
-    return balancer->getNeighborVertices();
-  }
-
-  /// method to provide the current load-balancing efficiency
-  /// @result the current LB efficiency (only valid before balance() was called)
-  W getEfficiency(){
-    return balancer->getEfficiency();
-  }
-
-  /// method to provide an estimated work efficiency after the balancing
-  /// @result the estimated LB efficieny (only valid after balance() was called)
-  W getEstimatedEfficiency() {
-    // currently only implemented for HISTOGRAM
-    return balancer->getEstimatedEfficiency();
-  }
-
-  /**********************/
-  /*  setter functions  */
-  /**********************/
-
-  /// method to set the size of the system, e.g. required for the HISTOGRAM
-  /// balancing method
-  /// @param[in] sysSize reference to a vector of T containing the size of
-  /// the orthogonal system in the following format: (x_min, x_max, y_min,
-  /// y_max, z_min, z_max)
-  void setSysSize(const std::vector<T> &sysSize) {
-    balancer.get()->setSysSize(sysSize);
-  }
-
-  /// method to set method specific data, that is not required by all different
-  /// balancing methods
-  /// @param[in] data pointer to a struct or other data object in the format the
-  /// methods requires
-  ///
-  /// For the histogram method the number of bins per dimensions can be given
-  /// as a C array of `int`s. The length of the array must be the number of
-  /// dimensions given to the load balancer. BE CAREFUL this is not enforced!
-  void setMethodData(const void *data) {
-    balancer.get()->setAdditionalData(data);
-  }
-
-  /// method to set the parameters of the used cartesian communicator
-  /// @param[in] loc the cartesian coordinates of the local domain in the
-  /// process grid
-  /// @param[in] size the size of the cartesian process grid
-  void setProcGridParams(const std::vector<int> &loc,
-                         const std::vector<int> &size) {
-    // todo(s.schulz): We should probably throw if the dimension does not match
-    procGridLoc.insert(procGridLoc.begin(), loc.begin(), loc.end());
-    procGridDim.insert(procGridDim.begin(), size.begin(), size.end());
-    procGridSet = true;
-  }
-
-  /// method to set the process tag output in VTK output
-  /// @param[in] tag
-  void setProcTag(int tag) { procTag = tag; }
-
-  /// method the set the minimum domain sizes in all directions, can be required
-  /// if using linked cell algorithms and wanting to prevent data exchange with
-  /// next-near neighbors instead of only with next neighbors
-  /// @param[in] minSize the minimum size of a domain in each dimension
-  void setMinDomainSize(const std::vector<T> &minSize) {
-    (balancer.get())->setMinDomainSize(minSize);
-  }
+    /**********************/
+    /*  getter functions  */
+    /**********************/
+
+    /// get the vertices before performing the load-balancing step
+    /// @result std::vector<ALL::Point<T>> containing the vertices before the
+    /// balancing step
+    std::vector<Point<T>> &getPrevVertices()
+    {
+      return balancer->getPrevVertices();
+    };
+
+    /// get the resulting vertices after the load-balancing step, if it has been
+    /// performed
+    /// @result std::vector<ALL::Point<T>> containing the resulting vertices after
+    /// the balancing step
+    std::vector<Point<T>> &getVertices() { return balancer->getVertices(); };
+
+    /// get the dimension of the ALL::Point<T> objects used to describe the
+    /// vertices of the domains
+    /// @result int containing the dimension of the vertices
+    int getDimension() { return balancer->getDimension(); }
+
+    /// method to get the work provided to the method
+    /// @param[out] result reference to std::vector<W> to store the vector of work
+    /// if an array of work was provided, e.g. for the histogram method
+    void getWork(std::vector<W> &result) { result = balancer->getWork(); }
+
+    /// method to get the work provided to the method
+    /// @result scalar work or first value of vector work
+    W getWork() { return balancer->getWork().at(0); }
+
+    /// method to provide a list of the ranks of the neighbors the local domain
+    /// has in all directions
+    /// @result vector of neighboring ranks
+    std::vector<int> &getNeighbors() { return balancer->getNeighbors(); }
+
+    /// method to provide a list of neighboring vertices, e.g. required for
+    /// VORONOI
+    /// @result vector in which the neighboring vertices are stored
+    std::vector<T> &getNeighborVertices()
+    {
+      return balancer->getNeighborVertices();
+    }
 
-#ifdef ALL_VTK_OUTPUT
-  /// method to create VTK based output of the domains used in the
-  /// load-balancing process (for now only orthogonal domains are supported)
-  /// @param[in] step the number of the loadbalancing step used for numbering
-  /// the output files
-  void printVTKoutlines(const int step) {
-    // define grid points, i.e. vertices of local domain
-    auto points = vtkSmartPointer<vtkPoints>::New();
-    // allocate space for eight points (describing the cuboid around the domain)
-    points->Allocate(8 * balancer->getDimension());
-
-    int n_ranks;
-    int local_rank;
-
-    if (!vtk_init) {
-      controller = vtkMPIController::New();
-      controller->Initialize();
-      controller->SetGlobalController(controller);
-      vtk_init = true;
+    /// method to provide the current load-balancing efficiency
+    /// @result the current LB efficiency (only valid before balance() was called)
+    W getEfficiency()
+    {
+      return balancer->getEfficiency();
     }
 
-    MPI_Comm_rank(communicator, &local_rank);
-    MPI_Comm_size(communicator, &n_ranks);
-
-    std::vector<Point<W>> tmp_outline(2);
-    std::vector<W> tmp_0(
-        {outline.at(0)[0], outline.at(0)[1], outline.at(0)[2]});
-    std::vector<W> tmp_1(
-        {outline.at(1)[0], outline.at(1)[1], outline.at(1)[2]});
-    tmp_outline.at(0) = Point<W>(tmp_0);
-    tmp_outline.at(1) = Point<W>(tmp_1);
-
-    for (auto z = 0; z <= 1; ++z)
-      for (auto y = 0; y <= 1; ++y)
-        for (auto x = 0; x <= 1; ++x) {
-          points->InsertPoint(x + 2 * y + 4 * z, tmp_outline.at(x)[0],
-                              tmp_outline.at(y)[1], tmp_outline.at(z)[2]);
-        }
+    /// method to provide an estimated work efficiency after the balancing
+    /// @result the estimated LB efficiency (only valid after balance() was called)
+    W getEstimatedEfficiency()
+    {
+      // currently only implemented for HISTOGRAM
+      return balancer->getEstimatedEfficiency();
+    }
 
-    auto hexa = vtkSmartPointer<vtkVoxel>::New();
-    for (int i = 0; i < 8; ++i)
-      hexa->GetPointIds()->SetId(i, i);
-
-    auto cellArray = vtkSmartPointer<vtkCellArray>::New();
-    cellArray->InsertNextCell(hexa);
-
-    // define work array (length = 1)
-    auto work = vtkSmartPointer<vtkFloatArray>::New();
-    work->SetNumberOfComponents(1);
-    work->SetNumberOfTuples(1);
-    work->SetName("work");
-    W total_work = std::accumulate(balancer->getWork().begin(),
-                                   balancer->getWork().end(), (W)0);
-    work->SetValue(0, total_work);
-
-    // define rank array (length = 4, x,y,z, rank)
-    int rank = 0;
-    MPI_Comm_rank(communicator, &rank);
-    auto coords = vtkSmartPointer<vtkFloatArray>::New();
-    coords->SetNumberOfComponents(3);
-    coords->SetNumberOfTuples(1);
-    coords->SetName("coords");
-    coords->SetValue(0, procGridLoc.at(0));
-    coords->SetValue(1, procGridLoc.at(1));
-    coords->SetValue(2, procGridLoc.at(2));
-
-    auto rnk = vtkSmartPointer<vtkFloatArray>::New();
-    rnk->SetNumberOfComponents(1);
-    rnk->SetNumberOfTuples(1);
-    rnk->SetName("MPI rank");
-    rnk->SetValue(0, rank);
-
-    // define tag array (length = 1)
-    auto tag = vtkSmartPointer<vtkIntArray>::New();
-    tag->SetNumberOfComponents(1);
-    tag->SetNumberOfTuples(1);
-    tag->SetName("tag");
-    tag->SetValue(0, procTag);
-
-    // determine extent of system
-    W known_unused global_extent[6];
-    W local_min[3];
-    W local_max[3];
-    W global_min[3];
-    W global_max[3];
-    W width[3];
-    W volume = (W)1.0;
-
-    for (int i = 0; i < 3; ++i) {
-      local_min[i] = outline.at(0)[i];
-      local_max[i] = outline.at(1)[i];
-      width[i] = local_max[i] - local_min[i];
-      volume *= width[i];
+    /**********************/
+    /*  setter functions  */
+    /**********************/
+
+    /// method to set the size of the system, e.g. required for the HISTOGRAM
+    /// balancing method
+    /// @param[in] sysSize reference to a vector of T containing the size of
+    /// the orthogonal system in the following format: (x_min, x_max, y_min,
+    /// y_max, z_min, z_max)
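+    ///
+    /// Minimal sketch (illustrative; box extents are placeholders):
+    /// \code{.cpp}
+    /// std::vector<double> sysSize = {0.0, 64.0, 0.0, 64.0, 0.0, 64.0};
+    /// lb.setSysSize(sysSize);
+    /// \endcode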
+    void setSysSize(const std::vector<T> &sysSize)
+    {
+      balancer.get()->setSysSize(sysSize);
     }
 
-    W surface = (W)2.0 * width[0] * width[1] + (W)2.0 * width[1] * width[2] +
-                (W)2.0 * width[0] * width[2];
-
-    auto expanse = vtkSmartPointer<vtkFloatArray>::New();
-    expanse->SetNumberOfComponents(6);
-    expanse->SetNumberOfTuples(1);
-    expanse->SetName("expanse");
-    expanse->SetValue(0, width[0]);
-    expanse->SetValue(1, width[0]);
-    expanse->SetValue(2, width[0]);
-    expanse->SetValue(3, volume);
-    expanse->SetValue(4, surface);
-    expanse->SetValue(5, surface / volume);
-
-    MPI_Allreduce(&local_min, &global_min, 3, MPIDataTypeW, MPI_MIN,
-                  communicator);
-    MPI_Allreduce(&local_max, &global_max, 3, MPIDataTypeW, MPI_MAX,
-                  communicator);
-
-    for (int i = 0; i < 3; ++i) {
-      global_extent[2 * i] = global_min[i];
-      global_extent[2 * i + 1] = global_max[i];
+    /// method to set method-specific data that is not required by all
+    /// balancing methods
+    /// @param[in] data pointer to a struct or other data object in the format
+    /// the method requires
+    ///
+    /// For the histogram method the number of bins per dimension can be given
+    /// as a C array of `int`s. The length of the array must be the number of
+    /// dimensions given to the load balancer. BE CAREFUL: this is not enforced!
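+    ///
+    /// Minimal sketch for HISTOGRAM (illustrative; bin counts are
+    /// placeholders):
+    /// \code{.cpp}
+    /// int nBins[3] = {128, 128, 128}; // one entry per dimension
+    /// lb.setMethodData(nBins);
+    /// \endcode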
+    void setMethodData(const void *data)
+    {
+      balancer.get()->setAdditionalData(data);
     }
 
-    // create a structured grid and assign data to it
-    auto unstructuredGrid = vtkSmartPointer<vtkUnstructuredGrid>::New();
-    unstructuredGrid->SetPoints(points);
-    unstructuredGrid->SetCells(VTK_VOXEL, cellArray);
-    unstructuredGrid->GetCellData()->AddArray(work);
-    unstructuredGrid->GetCellData()->AddArray(expanse);
-    unstructuredGrid->GetCellData()->AddArray(rnk);
-    unstructuredGrid->GetCellData()->AddArray(coords);
-    unstructuredGrid->GetCellData()->AddArray(tag);
-
-    createDirectory("vtk_outline");
-    std::ostringstream ss_local;
-    ss_local << "vtk_outline/ALL_vtk_outline_" << std::setw(7)
-             << std::setfill('0') << step << "_" << local_rank << ".vtu";
-
-    auto writer = vtkSmartPointer<vtkXMLUnstructuredGridWriter>::New();
-    writer->SetInputData(unstructuredGrid);
-    writer->SetFileName(ss_local.str().c_str());
-    writer->SetDataModeToAscii();
-    // writer->SetDataModeToBinary();
-    writer->Write();
-
-    // if (local_rank == 0)
-    //{
-    std::ostringstream ss_para;
-    ss_para << "vtk_outline/ALL_vtk_outline_" << std::setw(7)
-            << std::setfill('0') << step << ".pvtu";
-    // create the parallel writer
-    auto parallel_writer =
-        vtkSmartPointer<vtkXMLPUnstructuredGridWriter>::New();
-    parallel_writer->SetFileName(ss_para.str().c_str());
-    parallel_writer->SetNumberOfPieces(n_ranks);
-    parallel_writer->SetStartPiece(local_rank);
-    parallel_writer->SetEndPiece(local_rank);
-    parallel_writer->SetInputData(unstructuredGrid);
-    parallel_writer->SetDataModeToAscii();
-    // parallel_writer->SetDataModeToBinary();
-    parallel_writer->Write();
-    //}
-  }
-
-  /// method to create VTK based output of the vertices used in the
-  /// load-balancing process
-  /// @param[in] step the number of the loadbalancing step used for numbering
-  /// the output files
-  void printVTKvertices(const int step) {
-    int n_ranks;
-    int local_rank;
-
-    MPI_Comm_rank(communicator, &local_rank);
-    MPI_Comm_size(communicator, &n_ranks);
-
-    // local vertices
-    // (vertices + work)
-    T local_vertices[nVertices * balancer->getDimension() + 1];
-
-    for (int v = 0; v < nVertices; ++v) {
-      for (int d = 0; d < balancer->getDimension(); ++d) {
-        local_vertices[v * balancer->getDimension() + d] =
-            balancer->getVertices().at(v)[d];
-      }
+    /// method to set the parameters of the used Cartesian communicator
+    /// @param[in] loc the Cartesian coordinates of the local domain in the
+    /// process grid
+    /// @param[in] size the size of the Cartesian process grid
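+    ///
+    /// Minimal sketch (illustrative; myX/myY/myZ and gridX/gridY/gridZ denote
+    /// this rank's grid coordinates and the grid extents, all placeholders):
+    /// \code{.cpp}
+    /// std::vector<int> loc = {myX, myY, myZ};
+    /// std::vector<int> size = {gridX, gridY, gridZ};
+    /// lb.setProcGridParams(loc, size);
+    /// \endcode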
+    void setProcGridParams(const std::vector<int> &loc,
+                           const std::vector<int> &size)
+    {
+      // todo(s.schulz): We should probably throw if the dimension does not match
+      procGridLoc.insert(procGridLoc.begin(), loc.begin(), loc.end());
+      procGridDim.insert(procGridDim.begin(), size.begin(), size.end());
+      procGridSet = true;
     }
-    local_vertices[nVertices * balancer->getDimension()] =
-        (T)balancer->getWork().at(0);
-
-    /*
-    T *global_vertices;
-    if (local_rank == 0) {
-      global_vertices =
-          new T[n_ranks * (nVertices * balancer->getDimension() + 1)];
+
+    /// method to set the process tag written in the VTK output
+    /// @param[in] tag the tag value to be written for the local process
+    void setProcTag(int tag) { procTag = tag; }
+
+    /// method to set the minimum domain size in all directions; this can be
+    /// required when using linked-cell algorithms to prevent data exchange
+    /// with next-nearest neighbors instead of only with nearest neighbors
+    /// @param[in] minSize the minimum size of a domain in each dimension
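+    ///
+    /// Minimal sketch (illustrative; `cutoff` is a placeholder for the
+    /// linked-cell cutoff radius):
+    /// \code{.cpp}
+    /// std::vector<double> minSize = {cutoff, cutoff, cutoff};
+    /// lb.setMinDomainSize(minSize);
+    /// \endcode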
+    void setMinDomainSize(const std::vector<T> &minSize)
+    {
+      (balancer.get())->setMinDomainSize(minSize);
     }
-    */
-    T global_vertices[n_ranks * (nVertices * balancer->getDimension() + 1)];
 
-    // collect all works and vertices on a single process
-    MPI_Gather(local_vertices, nVertices * balancer->getDimension() + 1,
-               MPIDataTypeT, global_vertices,
-               nVertices * balancer->getDimension() + 1, MPIDataTypeT, 0,
-               communicator);
+#ifdef ALL_VTK_OUTPUT
+    /// method to create VTK-based output of the domains used in the
+    /// load-balancing process (for now only orthogonal domains are supported)
+    /// @param[in] step the number of the loadbalancing step used for numbering
+    /// the output files
+    void printVTKoutlines(const int step)
+    {
+      // define grid points, i.e. vertices of local domain
+      auto points = vtkSmartPointer<vtkPoints>::New();
+      // allocate space for eight points (describing the cuboid around the domain)
+      points->Allocate(8 * balancer->getDimension());
 
-    if (local_rank == 0) {
-      auto vtkpoints = vtkSmartPointer<vtkPoints>::New();
-#ifdef VTK_CELL_ARRAY_V2
-      vtkNew<vtkUnstructuredGrid> unstructuredGrid;
-      unstructuredGrid->Allocate(n_ranks + 10);
-#else
-      auto unstructuredGrid = vtkSmartPointer<vtkUnstructuredGrid>::New();
-#endif
+      int n_ranks;
+      int local_rank;
 
-      // enter vertices into unstructured grid
-      for (int i = 0; i < n_ranks; ++i) {
-        for (int v = 0; v < nVertices; ++v) {
-          vtkpoints->InsertNextPoint(
-              global_vertices[i * (nVertices * balancer->getDimension() + 1) +
-                              v * balancer->getDimension() + 0],
-              global_vertices[i * (nVertices * balancer->getDimension() + 1) +
-                              v * balancer->getDimension() + 1],
-              global_vertices[i * (nVertices * balancer->getDimension() + 1) +
-                              v * balancer->getDimension() + 2]);
-        }
+      if (!vtk_init)
+      {
+        controller = vtkMPIController::New();
+        controller->Initialize();
+        controller->SetGlobalController(controller);
+        vtk_init = true;
       }
-      unstructuredGrid->SetPoints(vtkpoints);
 
-      // data arrays for work and cell id
+      MPI_Comm_rank(communicator, &local_rank);
+      MPI_Comm_size(communicator, &n_ranks);
+
+      std::vector<Point<W>> tmp_outline(2);
+      std::vector<W> tmp_0(
+          {outline.at(0)[0], outline.at(0)[1], outline.at(0)[2]});
+      std::vector<W> tmp_1(
+          {outline.at(1)[0], outline.at(1)[1], outline.at(1)[2]});
+      tmp_outline.at(0) = Point<W>(tmp_0);
+      tmp_outline.at(1) = Point<W>(tmp_1);
+
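+      // insert the eight cuboid corners in VTK_VOXEL node ordering
+      // (point id = x + 2*y + 4*z)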
+      for (auto z = 0; z <= 1; ++z)
+        for (auto y = 0; y <= 1; ++y)
+          for (auto x = 0; x <= 1; ++x)
+          {
+            points->InsertPoint(x + 2 * y + 4 * z, tmp_outline.at(x)[0],
+                                tmp_outline.at(y)[1], tmp_outline.at(z)[2]);
+          }
+
+      auto hexa = vtkSmartPointer<vtkVoxel>::New();
+      for (int i = 0; i < 8; ++i)
+        hexa->GetPointIds()->SetId(i, i);
+
+      auto cellArray = vtkSmartPointer<vtkCellArray>::New();
+      cellArray->InsertNextCell(hexa);
+
+      // define work array (length = 1)
       auto work = vtkSmartPointer<vtkFloatArray>::New();
-      auto cell = vtkSmartPointer<vtkFloatArray>::New();
       work->SetNumberOfComponents(1);
-      work->SetNumberOfTuples(n_ranks);
+      work->SetNumberOfTuples(1);
       work->SetName("work");
-      cell->SetNumberOfComponents(1);
-      cell->SetNumberOfTuples(n_ranks);
-      cell->SetName("cell id");
-
-      for (int n = 0; n < n_ranks; ++n) {
+      W total_work = std::accumulate(balancer->getWork().begin(),
+                                     balancer->getWork().end(), (W)0);
+      work->SetValue(0, total_work);
+
+      // define coordinates array (length = 3: x, y, z) and rank array (length = 1)
+      int rank = 0;
+      MPI_Comm_rank(communicator, &rank);
+      auto coords = vtkSmartPointer<vtkFloatArray>::New();
+      coords->SetNumberOfComponents(3);
+      coords->SetNumberOfTuples(1);
+      coords->SetName("coords");
+      coords->SetValue(0, procGridLoc.at(0));
+      coords->SetValue(1, procGridLoc.at(1));
+      coords->SetValue(2, procGridLoc.at(2));
+
+      auto rnk = vtkSmartPointer<vtkFloatArray>::New();
+      rnk->SetNumberOfComponents(1);
+      rnk->SetNumberOfTuples(1);
+      rnk->SetName("MPI rank");
+      rnk->SetValue(0, rank);
+
+      // define tag array (length = 1)
+      auto tag = vtkSmartPointer<vtkIntArray>::New();
+      tag->SetNumberOfComponents(1);
+      tag->SetNumberOfTuples(1);
+      tag->SetName("tag");
+      tag->SetValue(0, procTag);
+
+      // determine extent of system
+      W known_unused global_extent[6];
+      W local_min[3];
+      W local_max[3];
+      W global_min[3];
+      W global_max[3];
+      W width[3];
+      W volume = (W)1.0;
+
+      for (int i = 0; i < 3; ++i)
+      {
+        local_min[i] = outline.at(0)[i];
+        local_max[i] = outline.at(1)[i];
+        width[i] = local_max[i] - local_min[i];
+        volume *= width[i];
+      }
 
-#ifdef VTK_CELL_ARRAY_V2
-        // define grid points, i.e. vertices of local domain
-        vtkIdType pointIds[8] = {8 * n + 0, 8 * n + 1, 8 * n + 2, 8 * n + 3,
-                                 8 * n + 4, 8 * n + 5, 8 * n + 6, 8 * n + 7};
-
-        vtkIdType faces[48] = {
-            3,         8 * n + 0, 8 * n + 2, 8 * n + 1, 
-            3,         8 * n + 1, 8 * n + 2, 8 * n + 3, 
-            3,         8 * n + 0, 8 * n + 4, 8 * n + 2, 
-            3,         8 * n + 2, 8 * n + 4, 8 * n + 6, 
-            3,         8 * n + 2, 8 * n + 6, 8 * n + 3, 
-            3,         8 * n + 3, 8 * n + 6, 8 * n + 7, 
-            3,         8 * n + 1, 8 * n + 5, 8 * n + 3, 
-            3,         8 * n + 3, 8 * n + 5, 8 * n + 7, 
-            3,         8 * n + 0, 8 * n + 4, 8 * n + 1, 
-            3,         8 * n + 1, 8 * n + 4, 8 * n + 5, 
-            3,         8 * n + 4, 8 * n + 6, 8 * n + 5, 
-            3,         8 * n + 5, 8 * n + 6, 8 * n + 7};
-
-        unstructuredGrid->InsertNextCell(VTK_POLYHEDRON, 8, pointIds, 12,
-                                         faces);
-#else
-        // define grid points, i.e. vertices of local domain
-        vtkIdType pointIds[8] = {8 * n + 0, 8 * n + 1, 8 * n + 2, 8 * n + 3,
-                                 8 * n + 4, 8 * n + 5, 8 * n + 6, 8 * n + 7};
-
-        auto faces = vtkSmartPointer<vtkCellArray>::New();
-        // setup faces of polyhedron
-        vtkIdType f0[3] = {8 * n + 0, 8 * n + 2, 8 * n + 1};
-        vtkIdType f1[3] = {8 * n + 1, 8 * n + 2, 8 * n + 3};
-
-        vtkIdType f2[3] = {8 * n + 0, 8 * n + 4, 8 * n + 2};
-        vtkIdType f3[3] = {8 * n + 2, 8 * n + 4, 8 * n + 6};
-
-        vtkIdType f4[3] = {8 * n + 2, 8 * n + 6, 8 * n + 3};
-        vtkIdType f5[3] = {8 * n + 3, 8 * n + 6, 8 * n + 7};
-
-        vtkIdType f6[3] = {8 * n + 1, 8 * n + 5, 8 * n + 3};
-        vtkIdType f7[3] = {8 * n + 3, 8 * n + 5, 8 * n + 7};
-
-        vtkIdType f8[3] = {8 * n + 0, 8 * n + 4, 8 * n + 1};
-        vtkIdType f9[3] = {8 * n + 1, 8 * n + 4, 8 * n + 5};
-
-        vtkIdType fa[3] = {8 * n + 4, 8 * n + 6, 8 * n + 5};
-        vtkIdType fb[3] = {8 * n + 5, 8 * n + 6, 8 * n + 7};
-
-        faces->InsertNextCell(3, f0);
-        faces->InsertNextCell(3, f1);
-        faces->InsertNextCell(3, f2);
-        faces->InsertNextCell(3, f3);
-        faces->InsertNextCell(3, f4);
-        faces->InsertNextCell(3, f5);
-        faces->InsertNextCell(3, f6);
-        faces->InsertNextCell(3, f7);
-        faces->InsertNextCell(3, f8);
-        faces->InsertNextCell(3, f9);
-        faces->InsertNextCell(3, fa);
-        faces->InsertNextCell(3, fb);
-
-        unstructuredGrid->InsertNextCell(VTK_POLYHEDRON, 8, pointIds, 12,
-                                         faces->GetPointer());
-#endif
-        work->SetValue(
-            n, global_vertices[n * (nVertices * balancer->getDimension() + 1) +
-                               8 * balancer->getDimension()]);
-        cell->SetValue(n, (T)n);
+      W surface = (W)2.0 * width[0] * width[1] + (W)2.0 * width[1] * width[2] +
+                  (W)2.0 * width[0] * width[2];
+
+      auto expanse = vtkSmartPointer<vtkFloatArray>::New();
+      expanse->SetNumberOfComponents(6);
+      expanse->SetNumberOfTuples(1);
+      expanse->SetName("expanse");
+      expanse->SetValue(0, width[0]);
+      expanse->SetValue(1, width[1]);
+      expanse->SetValue(2, width[2]);
+      expanse->SetValue(3, volume);
+      expanse->SetValue(4, surface);
+      expanse->SetValue(5, surface / volume);
+
+      MPI_Allreduce(&local_min, &global_min, 3, MPIDataTypeW, MPI_MIN,
+                    communicator);
+      MPI_Allreduce(&local_max, &global_max, 3, MPIDataTypeW, MPI_MAX,
+                    communicator);
+
+      for (int i = 0; i < 3; ++i)
+      {
+        global_extent[2 * i] = global_min[i];
+        global_extent[2 * i + 1] = global_max[i];
       }
+
+      // create an unstructured grid and assign data to it
+      auto unstructuredGrid = vtkSmartPointer<vtkUnstructuredGrid>::New();
+      unstructuredGrid->SetPoints(points);
+      unstructuredGrid->SetCells(VTK_VOXEL, cellArray);
       unstructuredGrid->GetCellData()->AddArray(work);
-      unstructuredGrid->GetCellData()->AddArray(cell);
+      unstructuredGrid->GetCellData()->AddArray(expanse);
+      unstructuredGrid->GetCellData()->AddArray(rnk);
+      unstructuredGrid->GetCellData()->AddArray(coords);
+      unstructuredGrid->GetCellData()->AddArray(tag);
+
+      createDirectory("vtk_outline");
+      std::ostringstream ss_local;
+      ss_local << "vtk_outline/ALL_vtk_outline_" << std::setw(7)
+               << std::setfill('0') << step << "_" << local_rank << ".vtu";
 
-      createDirectory("vtk_vertices");
-      std::ostringstream filename;
-      filename << "vtk_vertices/ALL_vtk_vertices_" << std::setw(7)
-               << std::setfill('0') << step << ".vtu";
       auto writer = vtkSmartPointer<vtkXMLUnstructuredGridWriter>::New();
       writer->SetInputData(unstructuredGrid);
-      writer->SetFileName(filename.str().c_str());
+      writer->SetFileName(ss_local.str().c_str());
       writer->SetDataModeToAscii();
       // writer->SetDataModeToBinary();
       writer->Write();
 
-      // delete[] global_vertices;
+      // if (local_rank == 0)
+      //{
+      std::ostringstream ss_para;
+      ss_para << "vtk_outline/ALL_vtk_outline_" << std::setw(7)
+              << std::setfill('0') << step << ".pvtu";
+      // create the parallel writer
+      auto parallel_writer =
+          vtkSmartPointer<vtkXMLPUnstructuredGridWriter>::New();
+      parallel_writer->SetFileName(ss_para.str().c_str());
+      parallel_writer->SetNumberOfPieces(n_ranks);
+      parallel_writer->SetStartPiece(local_rank);
+      parallel_writer->SetEndPiece(local_rank);
+      parallel_writer->SetInputData(unstructuredGrid);
+      parallel_writer->SetDataModeToAscii();
+      // parallel_writer->SetDataModeToBinary();
+      parallel_writer->Write();
+      //}
     }
-  }
-#endif
 
-private:
-  /// the used balancing method
-  LB_t method;
-
-  /// outer cuboid encasing the domain (for cuboid domains identical to domain)
-  /// defined by front left lower [0][*] and back upper right points [1][*]
-  /// where * is 0,...,dim-1
-  std::vector<Point<T>> outline;
-
-  /// the interal balancer object
-  std::unique_ptr<LB<T, W>> balancer;
-
-  /// storage of local work (scalar work is stored in the first entry of the
-  /// vector)
-  std::vector<W> workArray;
-
-  /// internal MPI communicator used in library
-  MPI_Comm communicator;
-
-  /// number of vertices (for non-cuboid domains)
-  int nVertices;
-
-  /// number of neighbors
-  int nNeighbors;
-
-  /// calculate the outline of the domain using the vertices, i.e. the smallest
-  /// orthogonal domain containing the vertices used
-  void calculate_outline() {
-    // calculation only possible if there are vertices defining the domain
-    if (balancer->getVertices().size() > 0) {
-      outline.resize(2);
-      // setup the outline with the first point
-      for (int i = 0; i < balancer->getDimension(); ++i) {
-        outline.at(0) = balancer->getVertices().at(0);
-        outline.at(1) = balancer->getVertices().at(0);
+    /// method to create VTK-based output of the vertices used in the
+    /// load-balancing process
+    /// @param[in] step the number of the loadbalancing step used for numbering
+    /// the output files
+    void printVTKvertices(const int step)
+    {
+      int n_ranks;
+      int local_rank;
+
+      MPI_Comm_rank(communicator, &local_rank);
+      MPI_Comm_size(communicator, &n_ranks);
+
+      // local vertices
+      // (vertices + work)
+      T local_vertices[nVertices * balancer->getDimension() + 1];
+
+      for (int v = 0; v < nVertices; ++v)
+      {
+        for (int d = 0; d < balancer->getDimension(); ++d)
+        {
+          local_vertices[v * balancer->getDimension() + d] =
+              balancer->getVertices().at(v)[d];
+        }
       }
-      // compare value of each outline point with all vertices to find the
-      // maximum extend of the domain in each dimension
-      for (int i = 1; i < (int)balancer->getVertices().size(); ++i) {
-        Point<T> p = balancer->getVertices().at(i);
-        for (int j = 0; j < balancer->getDimension(); ++j) {
-          outline.at(0)[j] = std::min(outline.at(0)[j], p[j]);
-          outline.at(1)[j] = std::max(outline.at(1)[j], p[j]);
+      local_vertices[nVertices * balancer->getDimension()] =
+          (T)balancer->getWork().at(0);
+
+      /*
+      T *global_vertices;
+      if (local_rank == 0) {
+        global_vertices =
+            new T[n_ranks * (nVertices * balancer->getDimension() + 1)];
+      }
+      */
+      T global_vertices[n_ranks * (nVertices * balancer->getDimension() + 1)];
+
+      // collect all works and vertices on a single process
+      MPI_Gather(local_vertices, nVertices * balancer->getDimension() + 1,
+                 MPIDataTypeT, global_vertices,
+                 nVertices * balancer->getDimension() + 1, MPIDataTypeT, 0,
+                 communicator);
+
+      if (local_rank == 0)
+      {
+        auto vtkpoints = vtkSmartPointer<vtkPoints>::New();
+#ifdef VTK_CELL_ARRAY_V2
+        vtkNew<vtkUnstructuredGrid> unstructuredGrid;
+        unstructuredGrid->Allocate(n_ranks + 10);
+#else
+        auto unstructuredGrid = vtkSmartPointer<vtkUnstructuredGrid>::New();
+#endif
+
+        // enter vertices into unstructured grid
+        for (int i = 0; i < n_ranks; ++i)
+        {
+          for (int v = 0; v < nVertices; ++v)
+          {
+            vtkpoints->InsertNextPoint(
+                global_vertices[i * (nVertices * balancer->getDimension() + 1) +
+                                v * balancer->getDimension() + 0],
+                global_vertices[i * (nVertices * balancer->getDimension() + 1) +
+                                v * balancer->getDimension() + 1],
+                global_vertices[i * (nVertices * balancer->getDimension() + 1) +
+                                v * balancer->getDimension() + 2]);
+          }
         }
+        unstructuredGrid->SetPoints(vtkpoints);
+
+        // data arrays for work and cell id
+        auto work = vtkSmartPointer<vtkFloatArray>::New();
+        auto cell = vtkSmartPointer<vtkFloatArray>::New();
+        work->SetNumberOfComponents(1);
+        work->SetNumberOfTuples(n_ranks);
+        work->SetName("work");
+        cell->SetNumberOfComponents(1);
+        cell->SetNumberOfTuples(n_ranks);
+        cell->SetName("cell id");
+
+        for (int n = 0; n < n_ranks; ++n)
+        {
+
+#ifdef VTK_CELL_ARRAY_V2
+          // define grid points, i.e. vertices of local domain
+          vtkIdType pointIds[8] = {8 * n + 0, 8 * n + 1, 8 * n + 2, 8 * n + 3,
+                                   8 * n + 4, 8 * n + 5, 8 * n + 6, 8 * n + 7};
+
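+          // the twelve triangles below tile the six faces of the cuboid; this
+          // assumes the eight vertices of each domain are ordered such that
+          // vertex v corresponds to corner (x, y, z) with v = x + 2y + 4z
+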
+          vtkIdType faces[48] = {
+              3, 8 * n + 0, 8 * n + 2, 8 * n + 1,
+              3, 8 * n + 1, 8 * n + 2, 8 * n + 3,
+              3, 8 * n + 0, 8 * n + 4, 8 * n + 2,
+              3, 8 * n + 2, 8 * n + 4, 8 * n + 6,
+              3, 8 * n + 2, 8 * n + 6, 8 * n + 3,
+              3, 8 * n + 3, 8 * n + 6, 8 * n + 7,
+              3, 8 * n + 1, 8 * n + 5, 8 * n + 3,
+              3, 8 * n + 3, 8 * n + 5, 8 * n + 7,
+              3, 8 * n + 0, 8 * n + 4, 8 * n + 1,
+              3, 8 * n + 1, 8 * n + 4, 8 * n + 5,
+              3, 8 * n + 4, 8 * n + 6, 8 * n + 5,
+              3, 8 * n + 5, 8 * n + 6, 8 * n + 7};
+
+          unstructuredGrid->InsertNextCell(VTK_POLYHEDRON, 8, pointIds, 12,
+                                           faces);
+#else
+          // define grid points, i.e. vertices of local domain
+          vtkIdType pointIds[8] = {8 * n + 0, 8 * n + 1, 8 * n + 2, 8 * n + 3,
+                                   8 * n + 4, 8 * n + 5, 8 * n + 6, 8 * n + 7};
+
+          auto faces = vtkSmartPointer<vtkCellArray>::New();
+          // setup faces of polyhedron
+          vtkIdType f0[3] = {8 * n + 0, 8 * n + 2, 8 * n + 1};
+          vtkIdType f1[3] = {8 * n + 1, 8 * n + 2, 8 * n + 3};
+
+          vtkIdType f2[3] = {8 * n + 0, 8 * n + 4, 8 * n + 2};
+          vtkIdType f3[3] = {8 * n + 2, 8 * n + 4, 8 * n + 6};
+
+          vtkIdType f4[3] = {8 * n + 2, 8 * n + 6, 8 * n + 3};
+          vtkIdType f5[3] = {8 * n + 3, 8 * n + 6, 8 * n + 7};
+
+          vtkIdType f6[3] = {8 * n + 1, 8 * n + 5, 8 * n + 3};
+          vtkIdType f7[3] = {8 * n + 3, 8 * n + 5, 8 * n + 7};
+
+          vtkIdType f8[3] = {8 * n + 0, 8 * n + 4, 8 * n + 1};
+          vtkIdType f9[3] = {8 * n + 1, 8 * n + 4, 8 * n + 5};
+
+          vtkIdType fa[3] = {8 * n + 4, 8 * n + 6, 8 * n + 5};
+          vtkIdType fb[3] = {8 * n + 5, 8 * n + 6, 8 * n + 7};
+
+          faces->InsertNextCell(3, f0);
+          faces->InsertNextCell(3, f1);
+          faces->InsertNextCell(3, f2);
+          faces->InsertNextCell(3, f3);
+          faces->InsertNextCell(3, f4);
+          faces->InsertNextCell(3, f5);
+          faces->InsertNextCell(3, f6);
+          faces->InsertNextCell(3, f7);
+          faces->InsertNextCell(3, f8);
+          faces->InsertNextCell(3, f9);
+          faces->InsertNextCell(3, fa);
+          faces->InsertNextCell(3, fb);
+
+          unstructuredGrid->InsertNextCell(VTK_POLYHEDRON, 8, pointIds, 12,
+                                           faces->GetPointer());
+#endif
+          work->SetValue(
+              n, global_vertices[n * (nVertices * balancer->getDimension() + 1) +
+                                 nVertices * balancer->getDimension()]);
+          cell->SetValue(n, (float)n);
+        }
+        unstructuredGrid->GetCellData()->AddArray(work);
+        unstructuredGrid->GetCellData()->AddArray(cell);
+
+        createDirectory("vtk_vertices");
+        std::ostringstream filename;
+        filename << "vtk_vertices/ALL_vtk_vertices_" << std::setw(7)
+                 << std::setfill('0') << step << ".vtu";
+        auto writer = vtkSmartPointer<vtkXMLUnstructuredGridWriter>::New();
+        writer->SetInputData(unstructuredGrid);
+        writer->SetFileName(filename.str().c_str());
+        writer->SetDataModeToAscii();
+        // writer->SetDataModeToBinary();
+        writer->Write();
       }
     }
-  }
-
-  /// create directory if it does not already exist
-  ///
-  /// May throw FilesystemErrorException
-  void createDirectory(const char *filename) {
-    errno = 0;
-    mode_t mode = S_IRWXU | S_IRWXG; // rwx for user and group
-    if (mkdir(filename, mode)) {
-      if (errno != EEXIST) {
-        throw FilesystemErrorException(__FILE__, __func__, __LINE__,
-                                       "Could not create output directory.");
+#endif
+
+  private:
+    /// the used balancing method
+    LB_t method;
+
+    /// outer cuboid encasing the domain (for cuboid domains identical to domain)
+    /// defined by front left lower [0][*] and back upper right points [1][*]
+    /// where * is 0,...,dim-1
+    std::vector<Point<T>> outline;
+
+    /// the internal balancer object
+    std::unique_ptr<LB<T, W>> balancer;
+
+    /// storage of local work (scalar work is stored in the first entry of the
+    /// vector)
+    std::vector<W> workArray;
+
+    /// internal MPI communicator used in library
+    MPI_Comm communicator;
+
+    /// number of vertices (for non-cuboid domains)
+    int nVertices;
+
+    /// number of neighbors
+    int nNeighbors;
+
+    /// calculate the outline of the domain using the vertices, i.e. the smallest
+    /// orthogonal domain containing the vertices used
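+    /// (e.g. the vertices (0,0,0), (2,1,3) and (1,4,2) yield the outline
+    /// points (0,0,0) and (2,4,3))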
+    void calculate_outline()
+    {
+      // calculation only possible if there are vertices defining the domain
+      if (balancer->getVertices().size() > 0)
+      {
+        outline.resize(2);
+        // initialize the outline with the first vertex
+        outline.at(0) = balancer->getVertices().at(0);
+        outline.at(1) = balancer->getVertices().at(0);
+        // compare each vertex with the current outline to find the
+        // maximum extent of the domain in each dimension
+        for (int i = 1; i < (int)balancer->getVertices().size(); ++i)
+        {
+          Point<T> p = balancer->getVertices().at(i);
+          for (int j = 0; j < balancer->getDimension(); ++j)
+          {
+            outline.at(0)[j] = std::min(outline.at(0)[j], p[j]);
+            outline.at(1)[j] = std::max(outline.at(1)[j], p[j]);
+          }
+        }
       }
     }
-    // check permission in case directory already existed, but had wrong
-    // permissions
-    struct stat attr;
-    stat(filename, &attr);
-    if ((attr.st_mode & S_IRWXU) != S_IRWXU) {
-      throw FilesystemErrorException(
-          __FILE__, __func__, __LINE__,
-          "Possibly already existing directory does not have correct "
-          "permissions (rwx) for user");
+
+    /// create directory if it does not already exist
+    ///
+    /// May throw FilesystemErrorException
+    void createDirectory(const char *filename)
+    {
+      errno = 0;
+      mode_t mode = S_IRWXU | S_IRWXG; // rwx for user and group
+      if (mkdir(filename, mode))
+      {
+        if (errno != EEXIST)
+        {
+          throw FilesystemErrorException(__FILE__, __func__, __LINE__,
+                                         "Could not create output directory.");
+        }
+      }
+      // check permission in case directory already existed, but had wrong
+      // permissions
+      struct stat attr;
+      stat(filename, &attr);
+      if ((attr.st_mode & S_IRWXU) != S_IRWXU)
+      {
+        throw FilesystemErrorException(
+            __FILE__, __func__, __LINE__,
+            "Possibly already existing directory does not have correct "
+            "permissions (rwx) for user");
+      }
     }
-  }
 
-  /// limit for the estimation procedure, which is an experimental way to use
-  /// non-updated work-loads to provide better results for the final vertex
-  /// positions
-  int estimation_limit;
+    /// limit for the estimation procedure, which is an experimental way to use
+    /// non-updated work-loads to provide better results for the final vertex
+    /// positions
+    int estimation_limit;
 
-  /// cartesian coordinates of the local domain in the process grid
-  std::vector<int> procGridLoc;
-  /// size of the process grid
-  std::vector<int> procGridDim;
-  /// check if the process grid data is set
-  bool procGridSet;
-  /// tag of the process
-  int procTag;
+    /// cartesian coordinates of the local domain in the process grid
+    std::vector<int> procGridLoc;
+    /// size of the process grid
+    std::vector<int> procGridDim;
+    /// check if the process grid data is set
+    bool procGridSet;
+    /// tag of the process
+    int procTag;
 
-  /// datatype for MPI to use when communicating vertex related data
-  MPI_Datatype MPIDataTypeT;
+    /// datatype for MPI to use when communicating vertex related data
+    MPI_Datatype MPIDataTypeT;
 
-  /// datatype for MPI to use when communicating work related data
-  MPI_Datatype MPIDataTypeW;
+    /// datatype for MPI to use when communicating work related data
+    MPI_Datatype MPIDataTypeW;
 
-  /// counter for the number of loadbalancing steps already performed
-  int loadbalancing_step;
+    /// counter for the number of loadbalancing steps already performed
+    int loadbalancing_step;
 
 #ifdef ALL_VTK_OUTPUT
-  /// check if the initialization of parallel VTK is already performed
-  bool vtk_init;
+    /// check if the initialization of parallel VTK is already performed
+    bool vtk_init;
 
-  /// the parallel VTK controller for the domain output
-  vtkMPIController *controller;
+    /// the parallel VTK controller for the domain output
+    vtkMPIController *controller;
 
-  /// the parallel VTK controller for the vertex output
-  vtkMPIController *controller_vertices;
+    /// the parallel VTK controller for the vertex output
+    vtkMPIController *controller_vertices;
 #endif
-};
+  };
 
 } // namespace ALL
 
diff --git a/include/ALL_Iterative.hpp b/include/ALL_Iterative.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1c348811fe17a673b276a0a7aab6b1aa81b620a4
--- /dev/null
+++ b/include/ALL_Iterative.hpp
@@ -0,0 +1,491 @@
+/*
+Copyright 2018-2025 Rene Halver, Forschungszentrum Juelich GmbH, Germany
+Copyright 2018-2020 Godehard Sutmann, Forschungszentrum Juelich GmbH, Germany
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation and/or
+   other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef ALL_ITERATIVE_HEADER_INCLUDED
+#define ALL_ITERATIVE_HEADER_INCLUDED
+
+#include "ALL_CustomExceptions.hpp"
+#include "ALL_Defines.h"
+#include "ALL_Functions.hpp"
+#include "ALL_LB.hpp"
+#include "ALL_Point.hpp"
+#include <cassert>
+#include <exception>
+#include <mpi.h>
+#include <numeric>
+#include <type_traits>
+
+#ifndef m_assert
+#define m_assert(expr, msg) assert(((void)(msg), (expr)))
+#endif
+
+namespace ALL
+{
+   /// Enum class to allow the selection of the scheme when using the iterative
+   /// unified solver
+   enum class ALL_ITERATIVE_SCHEME
+   {
+      STAGGERED,
+      TENSOR,
+      TENSOR_MAX
+   };
+
+   /// Container for the additional data required by the iterative balancer:
+   /// the base vectors of the domain, its anchor point, its location in the
+   /// process grid and the order in which the dimensions are balanced
+   /// @tparam T data type for vertices and related data
+   template <class T>
+   class IterativeData
+   {
+   public:
+      IterativeData(std::vector<Point<T>> &b, Point<T> &a, std::vector<int> &loc,
+                    std::vector<int> &o)
+          : anchor(a), base(b), location(loc), order(o)
+      {
+         // check if the lengths of all arguments match
+         m_assert(b.size() == loc.size(),
+                  "Size of base vector set must be equal to size of location vector.");
+         m_assert(b.size() == (size_t)a.getDimension(),
+                  "Size of base vector set must be equal to dimension of anchor point.");
+         m_assert(b.size() == o.size(),
+                  "Size of base vector set must be equal to size of order vector.");
+      }
+
+   private:
+      // the iterative balancer reads these members directly when the data is
+      // passed via setAdditionalData
+      template <class U, class V, ALL_ITERATIVE_SCHEME S>
+      friend class Iterative_LB;
+
+      Point<T> anchor;
+      std::vector<Point<T>> base;
+      std::vector<int> location;
+      std::vector<int> order;
+   };
+
+   /// Class providing all load-balancing schemes available in ALL that are based
+   /// on an iterative approach (TENSOR / STAGGERED). Both work by exchanging local
+   /// workload with local neighbors. Local in this case can mean a reduction of
+   /// single-domain workload within a sub-communicator of dimension d_sub < d_system.
+   /// Depending on the selected scheme the dimensions of these sub-communicators
+   /// may vary (STAGGERED) or are fixed to d-1 (TENSOR).
+   /// Based on these reduced workloads, the boundaries of neighboring subpartitions
+   /// are equalized, e.g. the thickness of planes of domains.
+   /// TENSOR and TENSOR_MAX differ in how the reduction within a subpartition
+   /// works: TENSOR uses the sum of the domain workloads, while TENSOR_MAX uses
+   /// the maximum domain workload in the subpartition.
+   /// @tparam T data type for vertices and related data
+   /// @tparam W data type for work and related data
+   /// @tparam SCHEME selected load-balancing scheme (TENSOR / STAGGERED / TENSOR_MAX)
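+   ///
+   /// A minimal usage sketch (illustrative only: the concrete base vectors,
+   /// anchor point, grid location and balancing order are placeholder values,
+   /// and the Point construction is assumed to take the dimension):
+   /// @code
+   ///   // topology data for a domain at grid location (0,1,2) of a 3-D grid
+   ///   std::vector<ALL::Point<double>> base(3, ALL::Point<double>(3));
+   ///   ALL::Point<double> anchor(3);
+   ///   std::vector<int> location = {0, 1, 2};
+   ///   std::vector<int> order = {2, 1, 0};
+   ///   ALL::IterativeData<double> data(base, anchor, location, order);
+   ///
+   ///   // staggered-grid variant with dimension 3, local work 1.0, gamma 1.5
+   ///   ALL::Iterative_LB<double, double, ALL::ALL_ITERATIVE_SCHEME::STAGGERED>
+   ///       lb(3, 1.0, 1.5);
+   ///   lb.setAdditionalData(&data);
+   ///   lb.setup();
+   /// @endcode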
+   template <class T, class W, ALL_ITERATIVE_SCHEME SCHEME>
+   class Iterative_LB : public LB<T, W>
+   {
+   public:
+      /// default constructor
+      Iterative_LB() {}
+      /// constructor to initialize values
+      /// @param[in] d dimension of the vertices used
+      /// @param[in] w the local (scalar) work load
+      /// @param[in] g the correction factor
+      Iterative_LB(int d, W w, T g)
+          : LB<T, W>(d, g), communicators(d, MPI_COMM_NULL), limit(d, 0),
+            topoNeighbors(2 * d, -1), nNeighbors(2 * d, -1), isDataSet(false)
+      {
+         this->setWork(w);
+      }
+
+      /// default destructor
+      ~Iterative_LB() = default;
+
+      /// setup internal data structures and parameters
+      void setup() override;
+
+      /// method to execute a load-balancing step
+      /// @param[in] step number of the load-balancing step
+      void balance(int step) override;
+
+      // getter for variables (passed by reference to avoid
+      // direct access to private members of the object)
+
+      /// method to provide the list of neighbors of the local domain
+      /// @return reference to a std::vector of integers containing the ranks
+      /// of the neighboring domains
+      virtual std::vector<int> &getNeighbors() override;
+
+      /// method to set the additional data structures needed by the iterative
+      /// schemes (base vectors, anchor point, location in the process grid and
+      /// balancing order)
+      /// @param[in] data pointer to an IterativeData<T> object
+      virtual void setAdditionalData(const void *data) override
+      {
+         // read the topology data from the provided pointer location
+         // TODO: check if there are ways to deal with this in a safer way ...
+         auto iterativeData = static_cast<const IterativeData<T> *>(data);
+         anchor = iterativeData->anchor;
+         base = iterativeData->base;
+         location = iterativeData->location;
+         order = iterativeData->order;
+         // determine the maximum location index in each dimension of the
+         // process grid ...
+         MPI_Allreduce(location.data(), limit.data(), this->dimension, MPI_INT,
+                       MPI_MAX, this->globalComm);
+         // ... and convert the zero-based maxima into the number of processes
+         // per dimension
+         for (auto &l : limit)
+            ++l;
+
+         isDataSet = true;
+      }
+
+      /// method to execute a load-balancing step with a selectable reduction
+      /// @param[in] step number of the load-balancing step
+      /// @param[in] reductionMode MPI reduction used to combine the work within
+      /// a subpartition (sum for TENSOR, maximum for TENSOR_MAX)
+      virtual void balance(int step, MPI_Op reductionMode);
+
+      /// method to get an estimated work distribution after the balance step
+      /// (currently only implemented in ALL::HISTOGRAM!)
+      /// @return the estimated load-balancing efficiency after the balance step
+      virtual W getEstimatedEfficiency() override { return (W)-1; };
+
+   private:
+      void createCommunicatorsStaggered();
+      void createCommunicatorsTensor();
+
+      // MPI data types matching the template parameters
+      MPI_Datatype MPIDataTypeT;
+      MPI_Datatype MPIDataTypeW;
+
+      // array of MPI communicators for each direction (to collect work
+      // on each plane)
+      std::vector<MPI_Comm> communicators;
+
+      // anchor point of domain
+      Point<T> anchor;
+
+      // base vectors
+      std::vector<Point<T>> base;
+
+      // location in topology
+      std::vector<int> location;
+      // number of processes in each dimension of the topology
+      std::vector<int> limit;
+
+      // order of hierarchy
+      std::vector<int> order;
+
+      // list of topological neighbors (for use in balance routine)
+      std::vector<int> topoNeighbors;
+
+      // list of neighbors (real system)
+      std::vector<int> neighbors;
+      // number of neighbors in each direction (2 * dimension entries)
+      std::vector<int> nNeighbors;
+
+      // flag indicating whether the additional data has been provided
+      bool isDataSet;
+
+      /// method to determine the neighbors of the local domain in a
+      /// TENSOR-based decomposition and store them in the neighbor list
+      void findNeighborsTensor();
+
+      /// method to determine the neighbors of the local domain in a
+      /// STAGGERED decomposition and store them in the neighbor list
+      void findNeighborsStaggered();
+
+   };
+
+   template <class T, class W, ALL_ITERATIVE_SCHEME SCHEME>
+   void Iterative_LB<T, W, SCHEME>::setup()
+   {
+      // determine the correct MPI data types for the template parameters
+      // (mirrors the type detection used elsewhere in the library)
+      MPIDataTypeT = std::is_same<T, double>::value  ? MPI_DOUBLE
+                     : std::is_same<T, float>::value ? MPI_FLOAT
+                     : std::is_same<T, int>::value   ? MPI_INT
+                                                     : MPI_LONG;
+      MPIDataTypeW = std::is_same<W, double>::value  ? MPI_DOUBLE
+                     : std::is_same<W, float>::value ? MPI_FLOAT
+                     : std::is_same<W, int>::value   ? MPI_INT
+                                                     : MPI_LONG;
+
+      // setup communicators based on the load-balancing scheme
+      if (SCHEME == ALL_ITERATIVE_SCHEME::STAGGERED)
+         createCommunicatorsStaggered();
+      else
+         createCommunicatorsTensor();
+
+      // determine real-space neighbors (using overlaps, since boundaries
+      // might be staggered); the neighbor search is identical for both
+      // schemes (just some overhead for TENSOR, as no domains can overlap
+      // in the sense of a staggered grid)
+      if (SCHEME == ALL_ITERATIVE_SCHEME::STAGGERED)
+         findNeighborsStaggered();
+      else
+         findNeighborsTensor();
+
+      // topological neighbors are not needed, since the communication in the
+      // library goes through rank 0 of the subcommunicators, which broadcasts
+      // the work of the neighboring subcommunicator
+   }
+
+   template <class T, class W, ALL_ITERATIVE_SCHEME SCHEME>
+   void Iterative_LB<T, W, SCHEME>::createCommunicatorsStaggered()
+   {
+      m_assert("Additional data for method ITERATIVE needs to be provided.", isDataSet);
+      // iterate through order (currDim)
+      int curr = 0;
+      for (auto d : order)
+      {
+         // the location in the current dimension determines which subpartition
+         // a domain belongs to (it is used as the color for the split)
+         int color = (int)location[d];
+         int key = 0;
+
+         // compute the relative position in the subcommunicator by using a
+         // process grid of form (max n_i, 0 <= i < d), with n_i being the
+         // number of processes in direction i (this may vary in non-cartesian
+         // grids, e.g. (2,(4,2),4); in that case a 2 x 4 x 4 grid is used to
+         // compute the relative positions) -> subcommunicators are always
+         // built from the global communicator, using all remaining dimensions
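+         // as an illustration (assuming limit = (2,4,4), location = (1,2,3)
+         // and d = 1): key = location[0] + location[2] * limit[0] * limit[1]
+         //                 = 1 + 3 * 8 = 25
+         // (keys only need to be unique within a color, not dense)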
+         for (int i = 0; i < this->dimension; ++i)
+         {
+            // compute the key by using the linear index transformation from (d-1)D to 1D,
+            // => index [in dimension d] = location[0] + location[1] * limit[0] + ...
+            if (i == d)
+               continue;
+            int tmp = location[i];
+            for (auto t = 0; t < i; ++t)
+               tmp *= limit[t];
+            key += tmp;
+         }
+
+         if (curr == (this->dimension - 1))
+         {
+            // create the subcommunicator with MPI_Comm_split
+            MPI_Comm_split(this->globalComm, color, key, communicators.data() + d);
+         }
+         else
+         {
+            // no communicators for neighbor - neighbor exchange (too many
+            // communicators when using many processes)
+            communicators.at(d) = MPI_COMM_NULL;
+         }
+         ++curr;
+      }
+   }
+
+   template <class T, class W, ALL_ITERATIVE_SCHEME SCHEME>
+   void Iterative_LB<T, W, SCHEME>::createCommunicatorsTensor()
+   {
+      m_assert("Additional data for method ITERATIVE needs to be provided.", isDataSet);
+      int curr = 0;
+      // iterate through order (currDim)
+      for (auto d : order)
+      {
+         // the location in the current dimension determines which subpartition
+         // a domain belongs to (it is used as the color for the split)
+         int color = (int)location[d];
+         int key = 0;
+
+         // compute the relative position in the subcommunicator by using a
+         // process grid of form (max n_i, 0 <= i < d), with n_i being the
+         // number of processes in direction i (this may vary in non-cartesian
+         // grids, e.g. (2,(4,2),4); in that case a 2 x 4 x 4 grid is used to
+         // compute the relative positions) -> subcommunicators are always
+         // built from the last used subcommunicator, using all remaining
+         // dimensions within -> the size of the subcommunicators shrinks ->
+         // the last level contains only the neighbors in the last dimension
+         // of the order
+         for (int i = this->dimension - 1; i > curr; --i)
+         {
+            // compute the key by using the linear index transformation of the
+            // remaining dimensions of the order to 1D,
+            // => key = location[order[dim-1]]
+            //        + location[order[dim-2]] * limit[order[dim-1]] + ...
+            // to comply with the staggered grid, the size of the
+            // sub-communicators decreases with each level:
+            // ( dim-1, dim-2, ..., 1 )
+            int tmp = location[order[i]];
+            for (auto t = this->dimension - 1; t > i; --t)
+               tmp *= limit[order[t]];
+            key += tmp;
+         }
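+         // as an illustration (assuming order = (2,1,0), limit = (2,4,4) and
+         // location = (1,2,3)): at curr = 0 the key is
+         // location[order[2]] + location[order[1]] * limit[order[2]]
+         //                    = 1 + 2 * 2 = 5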
+
+         if (curr == (this->dimension - 1))
+         {
+            // create subcommunicators with MPI_Comm_split
+            MPI_Comm_split(this->globalComm, color, key, communicators.data() + d);
+         }
+         else
+         {
+            // no communicators for neighbor - neighbor exchange (too many
+            // communicators when using many processes)
+            communicators.at(d) = MPI_COMM_NULL;
+         }
+
+         ++curr;
+      }
+   }
+
+   template <class T, class W, ALL_ITERATIVE_SCHEME SCHEME>
+   void Iterative_LB<T, W, SCHEME>::balance(known_unused int step)
+   {
+      // not implemented yet; the actual balancing is foreseen to go through
+      // balance(step, reductionMode) with the reduction matching the scheme
+      // (sum for TENSOR, maximum for TENSOR_MAX)
+   }
+
+   template <class T, class W, ALL_ITERATIVE_SCHEME SCHEME>
+   std::vector<int> &Iterative_LB<T, W, SCHEME>::getNeighbors()
+   {
+      return neighbors;
+   }
+
+   /// @brief  Method to find neighbors in a TENSOR-decomposition. Needs to
+   ///         be called only once, since the neighbor topology never changes.
+   /// @tparam T data type for vertices and related data
+   /// @tparam W data type for work and related data
+   /// @tparam SCHEME selected load-balancing scheme
+   template <class T, class W, ALL_ITERATIVE_SCHEME SCHEME>
+   void Iterative_LB<T, W, SCHEME>::findNeighborsTensor()
+   {
+
+      // total number of processes in the process grid
+      auto mapSize = std::accumulate(limit.begin(), limit.end(), 1,
+                                     std::multiplies<int>());
+
+      // record layout per process: the local rank, the anchor point
+      // (dimension entries) and the dimension base vectors (dimension
+      // entries each)
+      std::vector<T> localMap(((this->dimension + 1) * this->dimension) + 1);
+      std::vector<T> map(mapSize *
+                         (((this->dimension + 1) * this->dimension) + 1));
+
+      // linear index of the local domain in the topology (mixed-radix
+      // transformation of the location vector, following the given order);
+      // used to look up records of neighboring domains in the gathered map
+      int location1d = 0;
+      int tmp = 1;
+      for (int d = 0; d < this->dimension; ++d)
+      {
+         location1d += location[order[d]] * tmp;
+         tmp *= limit[order[d]];
+      }
+
+      // local rank
+      localMap.at(0) = (T)this->localRank;
+      // anchor point
+      for (int d = 0; d < this->dimension; ++d)
+      {
+         localMap.at(1 + d) = anchor.at(d);
+      }
+      // base vectors
+      for (int dBV = 0; dBV < this->dimension; ++dBV)
+      {
+         for (int d = 0; d < this->dimension; ++d)
+         {
+            localMap.at((dBV + 1) * this->dimension + 1 + d) = base.at(dBV).at(d);
+         }
+      }
+
+      // collect the local record of each process into a global map
+      MPI_Allgather(localMap.data(), (int)localMap.size(), MPIDataTypeT,
+                    map.data(), (int)localMap.size(), MPIDataTypeT,
+                    this->globalComm);
+
+      // TODO: derive the neighbor lists from the gathered map; TENSOR is the
+      // easier case, since per definition TENSOR always requires a regular
+      // cartesian grid of form nx * ny * nz, i.e. the number of processes in
+      // a column / plane never changes
+   }
+
+   /// @brief  Method to find neighbors in a STAGGERED decomposition. Needs to
+   ///         be called after each balance step, since staggered boundaries
+   ///         can change the neighbor topology.
+   /// @tparam T data type for vertices and related data
+   /// @tparam W data type for work and related data
+   /// @tparam SCHEME selected load-balancing scheme
+   template <class T, class W, ALL_ITERATIVE_SCHEME SCHEME>
+   void Iterative_LB<T, W, SCHEME>::findNeighborsStaggered()
+   {
+
+      // total number of processes in the process grid
+      auto mapSize = std::accumulate(limit.begin(), limit.end(), 1,
+                                     std::multiplies<int>());
+
+      // record layout per process: the local rank, the anchor point
+      // (dimension entries) and the dimension base vectors (dimension
+      // entries each)
+      std::vector<T> localMap(((this->dimension + 1) * this->dimension) + 1);
+      std::vector<T> map(mapSize *
+                         (((this->dimension + 1) * this->dimension) + 1));
+
+      // linear index of the local domain in the topology (mixed-radix
+      // transformation of the location vector, following the given order);
+      // used to look up records of neighboring domains in the gathered map
+      int location1d = 0;
+      int tmp = 1;
+      for (int d = 0; d < this->dimension; ++d)
+      {
+         location1d += location[order[d]] * tmp;
+         tmp *= limit[order[d]];
+      }
+
+      // local rank
+      localMap.at(0) = (T)this->localRank;
+      // anchor point
+      for (int d = 0; d < this->dimension; ++d)
+      {
+         localMap.at(1 + d) = anchor.at(d);
+      }
+      // base vectors
+      for (int dBV = 0; dBV < this->dimension; ++dBV)
+      {
+         for (int d = 0; d < this->dimension; ++d)
+         {
+            localMap.at((dBV + 1) * this->dimension + 1 + d) = base.at(dBV).at(d);
+         }
+      }
+
+      // collect the local record of each process into a global map
+      MPI_Allgather(localMap.data(), (int)localMap.size(), MPIDataTypeT,
+                    map.data(), (int)localMap.size(), MPIDataTypeT,
+                    this->globalComm);
+
+      // TODO: derive the neighbor lists from the gathered map; for STAGGERED
+      // the overlaps of the (possibly staggered) domain boundaries have to be
+      // checked, since the neighbor topology can change after a balance step
+   }
+} // namespace ALL
+
+#endif