diff --git a/example/ALL_test.cpp b/example/ALL_test.cpp
index 045f7cdfbdc95d0272ddaebd3f2ca43cba613d92..517c6ef0c014c6dd30202a2e4aeab32816c1358c 100644
--- a/example/ALL_test.cpp
+++ b/example/ALL_test.cpp
@@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <random>
 #include <stdlib.h>
 #include <iostream>
+#include <iomanip>
 #include <exception>
 #include <fstream>
 #include <sstream>
@@ -57,7 +58,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #define N_GENERATE 1000 
 #define SEED 123789456u
 #define N_LOOP 500 
-#define OUTPUT_INTV 1 
+#define OUTPUT_INTV 100 
 #define MAX_NEIG 1024
 
 #ifdef ALL_VORONOI
@@ -111,14 +112,63 @@ void generate_points(std::vector<ALL_Point<double>>& points,
     }
 }
 
+/****************************************************************
+ *
+ * input routine for example files
+ *
+ * format of MPI I/O files:
+ *
+ *   n_proc int         : offset in point data
+ *   n_proc int         : number of points on domain
+ *   5*n_point double   : point data
+****************************************************************/
+
 void read_points(std::vector<ALL_Point<double>>& points,
                  std::vector<double> l,
                  std::vector<double> u,
                  int& n_points,
                  char* filename,
                  int dimension,
-                 int rank)
+                 int rank,
+                 MPI_Comm comm)
 {
+    MPI_File file;
+    int err = MPI_File_open(comm, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &file);
+
+    n_points = 0;
+
+    int n_procs;
+    int offset;
+    MPI_Comm_size(comm,&n_procs);
+
+    if (err)
+    {
+        if (rank == 0) std::cout << "File could not be opened: " << filename << std::endl;
+        MPI_Abort(MPI_COMM_WORLD,-1);
+    }
+
+    // read offset from file
+    MPI_File_read_at(file,(MPI_Offset)(rank*sizeof(int)),&offset,1,MPI_INT,MPI_STATUS_IGNORE);
+    // read number of points from file
+    MPI_File_read_at(file,(MPI_Offset)((n_procs + rank)*sizeof(int)),&n_points,1,MPI_INT,MPI_STATUS_IGNORE);
+
+    double values[5];
+    for (int i = 0; i < n_points; ++i)
+    {
+        MPI_File_read_at(file,
+                         (MPI_Offset)(5*(offset + i)*sizeof(double) + 2 * n_procs * sizeof(int)),
+                         values,
+                         5,
+                         MPI_DOUBLE,
+                         MPI_STATUS_IGNORE);
+        ALL_Point<double> p(dimension,&values[1],values[4]);
+        points.push_back(p);
+    }
+
+    MPI_File_close(&file);
+
+    /*
+
     std::ifstream ifile;
     ifile.open(filename);
 
@@ -150,6 +200,7 @@ void read_points(std::vector<ALL_Point<double>>& points,
     }
 
     ifile.close();
+    */
 }
 
 #ifdef ALL_VTK_OUTPUT
@@ -248,7 +299,7 @@ int main(int argc, char** argv)
         bool weighted_points = false;
         char* filename = NULL;
         double gamma = 16.0;
-
+        int output_step = 0;
         int global_dim[sys_dim];
         for (int d = 0; d < sys_dim; ++d)
             global_dim[d] = 0;
@@ -491,19 +542,22 @@ int main(int argc, char** argv)
         // generate points on domains
         int n_points = N_GENERATE;
         int max_particles = 1;
+        if (local_rank == 0) std::cout << "creating / reading point data" << std::endl;
         if (argc <= 4)
         {
             generate_points(points,l,u,coords,global_dim,sys_dim,n_points,local_rank);
         }
         else
         {
-            read_points(points,l,u,n_points,filename,sys_dim,local_rank);
+            read_points(points,l,u,n_points,filename,sys_dim,local_rank, cart_comm);
         }
         double* transfer;
         double* recv;
 
         MPI_Allreduce(&n_points,&max_particles,1,MPI_INT,MPI_MAX,cart_comm);
 
+        //if (local_rank == 0) std::cout << "maximum number of points on any process: " << max_particles << std::endl;
+
         recv = new double[27 * (sys_dim+1) * max_particles];
         transfer = new double[27 * (sys_dim+1) * max_particles];
 
@@ -549,28 +603,31 @@ int main(int argc, char** argv)
         MPI_Allreduce(&n_local,&total_points,1,MPI_DOUBLE,MPI_SUM,cart_comm);
 
         // output of borders / contents
-        for (int i = 0; i < n_ranks; ++i)
+        if (n_ranks < 216)
         {
-            if (local_rank == i)
+            for (int i = 0; i < n_ranks; ++i)
             {
-                std::ofstream of;
-                of.open("domain_data.dat", std::ios::out | std::ios::app);
-                of << 0 << " " << local_rank << " ";
-
-                for (int j = 0; j < sys_dim; ++j)
+                if (local_rank == i)
                 {
-                    of << " " << local_coords[j] << " " << lp.x(j) << " " << up.x(j) << " ";
-                }
+                    std::ofstream of;
+                    of.open("domain_data.dat", std::ios::out | std::ios::app);
+                    of << 0 << " " << local_rank << " ";
 
-                of << " " << n_local << " ";
-                
-                of << std::endl;
-                if (i == n_ranks - 1) of << std::endl;
-                of.close();
-                MPI_Barrier(cart_comm);
+                    for (int j = 0; j < sys_dim; ++j)
+                    {
+                        of << " " << local_coords[j] << " " << lp.x(j) << " " << up.x(j) << " ";
+                    }
+
+                    of << " " << n_local << " ";
+                    
+                    of << std::endl;
+                    if (i == n_ranks - 1) of << std::endl;
+                    of.close();
+                    MPI_Barrier(cart_comm);
+                }
+                else
+                    MPI_Barrier(cart_comm);
             }
-            else
-                MPI_Barrier(cart_comm);
         }
 
 #ifdef ALL_VORONOI       
@@ -878,12 +935,17 @@ int main(int argc, char** argv)
             }
        }
 #endif        
-
+        double limit_efficiency = 0.5;
         // create ALL object
         ALL<double,double> lb_obj(sys_dim,vertices,gamma);
         for (int i_loop = 0; i_loop < max_loop; ++i_loop)
         {
             MPI_Barrier(cart_comm);
+            if (d_ratio < limit_efficiency)
+            {
+                gamma /= 2.0;
+                limit_efficiency /= 2.0;
+            }
             if (local_rank == 0) std::cout << "loop " << i_loop << ": " << std::endl;
             double work = 0.0;
            
@@ -1064,10 +1126,15 @@ int main(int argc, char** argv)
                     lb_obj.get_neighbors(chosen_method,neighbors);
                     lb_obj.get_neighbors(chosen_method,&n_neighbors);
 
+
                     offset_neig[0] = 0;
                     offset_neig[1] = n_neighbors[0];
                     for (int i = 0; i < sys_dim; ++i)
                     {
+
+                        //MPI_Barrier(cart_comm);
+                        //if (local_rank == 0) std::cout << "filling buffers with remote points, dim " << i << std::endl;
+
                         for (int j = 0; j < 2; ++j)
                         {
                             n_transfer[j] = 0;
@@ -1115,16 +1182,47 @@ int main(int argc, char** argv)
                                 }
                             }
                         }
-                        for (auto p = points.begin(); p != points.end(); ++p)
+
+                        //MPI_Barrier(cart_comm);
+                        //if (local_rank == 0) std::cout << "cleaning point vector, dim " << i << std::endl;
+
+                        auto rem_it = std::remove_if(
+                                        points.begin(),
+                                        points.end(),
+                                        [vertices,i](ALL_Point<double> p)
+                                        {
+                                            return 
+                                                p.x(i) < vertices.at(0).x(i) ||
+                                                p.x(i) >= vertices.at(1).x(i);
+                                        });
+                        points.erase(rem_it,points.end());
+                        n_points = points.size();
+                         
+                        /*
+                        for (int j = n_points-1; j >= 0; --j)
                         {
-                            if (p->x(i) < vertices.at(0).x(i) || p->x(i) >= vertices.at(1).x(i))
+                            if
+                            (
+                                points.at(j).x(i) < vertices.at(0).x(i) || 
+                                points.at(j).x(i) >= vertices.at(1).x(i)
+                            )
                             {
-                                points.erase(p);
-                                n_points--;
-                                p--;
+                                if (j == n_points - 1)
+                                {
+                                    n_points--;
+                                }
+                                else
+                                {
+                                    std::swap(points.at(j),points.at(n_points-1));
+                                    n_points--;
+                                }
                             }
                         }
-                        MPI_Status status;
+                        points.resize(n_points);
+                        */
+
+                        //MPI_Barrier(cart_comm);
+                        //if (local_rank == 0) std::cout << "exchanging number of sent points, dim " << i << std::endl;
 
                         // TODO -> better estimation than max 1024 neighbors ...
                         MPI_Request sreq_r[MAX_NEIG], rreq_r[MAX_NEIG];
@@ -1169,6 +1267,9 @@ int main(int argc, char** argv)
                         MPI_Waitall(n_neighbors[2*i],rreq_l,rsstatus);
                         MPI_Waitall(n_neighbors[2*i+1],rreq_r,rrstatus);
 
+                        //MPI_Barrier(cart_comm);
+                        //if (local_rank == 0) std::cout << "exchanging point data, dim " << i << std::endl;
+
                         int offset_l = 0;
                         // send particles to corresponding neighbor
                         for (int d = 0; d < n_neighbors[2*i]; ++d)
@@ -1192,6 +1293,7 @@ int main(int argc, char** argv)
                         MPI_Waitall(n_neighbors[2*i],rreq_l,rsstatus);
                         MPI_Waitall(n_neighbors[2*i+1],rreq_r,rrstatus);
 
+                        //if (local_rank == 0) std::cout << "adding received point data, dim " << i << std::endl;
                         for (int j = 0; j < offset_l; ++j)
                         {
                             ALL_Point<double> p(sys_dim,&recv[j * (sys_dim+1)],recv[j * (sys_dim+1) + sys_dim]);
@@ -1500,7 +1602,7 @@ int main(int argc, char** argv)
                     }
 
 #else
-                    std::cout << "Currently no UNSTRUCTURED test without VTK!" << std::endl;
+                    if (local_rank == 0) std::cout << "Currently no UNSTRUCTURED test without VTK!" << std::endl;
                     MPI_Abort(MPI_COMM_WORLD,-1);                    
 #endif                    
                 }
@@ -1701,18 +1803,21 @@ int main(int argc, char** argv)
                 break;
             }
 
-            if (i_loop % OUTPUT_INTV == 0)
+            if (i_loop <= 100 || i_loop % OUTPUT_INTV == 0)
             {
-
 #ifdef ALL_VORONOI
                 if (chosen_method != ALL_LB_t::VORONOI)
                 {
 #endif                
 #ifdef ALL_VTK_OUTPUT
-                    lb_obj.print_vtk_outlines(i_loop/OUTPUT_INTV);    
-                    lb_obj.print_vtk_vertices(i_loop/OUTPUT_INTV);
-                    print_points(points,i_loop/OUTPUT_INTV,chosen_method,cart_comm);
-#endif
+                    //if (local_rank == 0) std::cout << "creating vtk outlines output" << std::endl;
+                    if (chosen_method != ALL_LB_t::UNSTRUCTURED) lb_obj.print_vtk_outlines(output_step);    
+                    //if (local_rank == 0) std::cout << "creating vtk vertices output" << std::endl;
+                    if (chosen_method == ALL_LB_t::UNSTRUCTURED) lb_obj.print_vtk_vertices(output_step);
+                    //if (local_rank == 0) std::cout << "creating points output" << std::endl;
+                    if (i_loop == 0) print_points(points,i_loop/OUTPUT_INTV,chosen_method,cart_comm);
+#endif              
+                    output_step++;
 #ifdef ALL_VORONOI
                 }
                 else
@@ -1742,7 +1847,7 @@ int main(int argc, char** argv)
 
             if ( std::abs(n_total_points - total_points) > 1e-6)
             {
-                std::cout << n_total_points << " != " << total_points << std::endl;
+                std::cout << std::setprecision(14) <<  n_total_points << " != " << total_points << std::endl;
                 return(-1);
             }
 
@@ -1826,33 +1931,35 @@ int main(int argc, char** argv)
         }
 
         // output of borders / contents
-        for (int i = 0; i < n_ranks; ++i)
+        if (n_ranks < 216)
         {
-            if (local_rank == i)
+            for (int i = 0; i < n_ranks; ++i)
             {
-                std::ofstream of;
-                if (!weighted_points)
-                    of.open("domain_data.dat", std::ios::out | std::ios::app);
-                else
-                    of.open("domain_data_w.dat", std::ios::out | std::ios::app);
-                of << 0 << " " << local_rank << " ";
-
-                for (int j = 0; j < sys_dim; ++j)
+                if (local_rank == i)
                 {
-                    of << " " << local_coords[j] << " " << lp.x(j) << " " << up.x(j) << " ";
-                }
+                    std::ofstream of;
+                    if (!weighted_points)
+                        of.open("domain_data.dat", std::ios::out | std::ios::app);
+                    else
+                        of.open("domain_data_w.dat", std::ios::out | std::ios::app);
+                    of << 0 << " " << local_rank << " ";
 
-                of << " " << n_local << " ";
-                
-                of << std::endl;
-                if (i == n_ranks - 1) of << std::endl;
-                of.close();
-                MPI_Barrier(cart_comm);
+                    for (int j = 0; j < sys_dim; ++j)
+                    {
+                        of << " " << local_coords[j] << " " << lp.x(j) << " " << up.x(j) << " ";
+                    }
+
+                    of << " " << n_local << " ";
+                    
+                    of << std::endl;
+                    if (i == n_ranks - 1) of << std::endl;
+                    of.close();
+                    MPI_Barrier(cart_comm);
+                }
+                else
+                    MPI_Barrier(cart_comm);
             }
-            else
-                MPI_Barrier(cart_comm);
         }
-
         delete transfer;
 
         if (local_rank == 0)